debuggers.hg
changeset 17978:08f77df14cba
merge with xen-unstable.hg
author   Isaku Yamahata <yamahata@valinux.co.jp>
date     Wed Jul 02 11:30:37 2008 +0900 (2008-07-02)
parents  11318234588e 19970181d6a4
children 40e7329105fa ac8bc814faba
files    tools/libxc/ia64/xc_ia64_linux_restore.c tools/libxc/ia64/xc_ia64_linux_save.c
line diff
1.1 --- a/docs/ChangeLog Thu Jun 19 12:48:04 2008 +0900 1.2 +++ b/docs/ChangeLog Wed Jul 02 11:30:37 2008 +0900 1.3 @@ -16,6 +16,15 @@ http://lists.xensource.com/archives/html 1.4 Xen 3.3 release 1.5 --------------- 1.6 1.7 +17903: Add greater than 16 xvd device availability 1.8 +http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d 1.9 + 1.10 +The tools can now attach a disk of the form: 1.11 +(1<<28) | (device<<8) | partition 1.12 +to support many more xvd disks and up to 256 partitions. 1.13 +The linux guest frontend has been expanded to support 1.14 +this new construct, while legacy guests should just ignore it. 1.15 + 1.16 17538: Add XENPF_set_processor_pminfo 1.17 http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9 1.18
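The extended xvd numbering announced in this ChangeLog entry is easy to compute. The sketch below is illustrative only (the helper names are made up); the classic 202-major layout is the one matched by the /dev/xvd[a-p] rule elsewhere in this changeset, and the extended form is the `(1<<28) | (device<<8) | partition` construct described above:

```c
#include <stdint.h>
#include <stdio.h>

/* Classic xvd encoding: major 202, 16 minors (partitions) per disk. */
static uint32_t xvd_classic_devnum(unsigned disk, unsigned partition)
{
    return (202u << 8) | (disk << 4) | partition;
}

/* Extended encoding from c/s 17903: bit 28 set, 8 bits of partition,
 * so each disk can expose up to 256 partitions and many more disks fit. */
static uint32_t xvd_extended_devnum(unsigned disk, unsigned partition)
{
    return (1u << 28) | (disk << 8) | partition;
}

int main(void)
{
    /* e.g. the third disk (xvdc), partition 3, in both encodings */
    printf("classic  xvdc3        -> %#x\n", (unsigned)xvd_classic_devnum(2, 3));
    printf("extended disk 2, part 3 -> %#x\n", (unsigned)xvd_extended_devnum(2, 3));
    return 0;
}
```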
2.1 --- a/extras/mini-os/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900 2.2 +++ b/extras/mini-os/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900 2.3 @@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un 2.4 2.5 static void clear_bootstrap(void) 2.6 { 2.7 - xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) }; 2.8 - int n = sizeof(mfns)/sizeof(*mfns); 2.9 pte_t nullpte = { }; 2.10 2.11 /* Use first page as the CoW zero page */ 2.12 memset(&_text, 0, PAGE_SIZE); 2.13 - mfn_zero = pfn_to_mfn((unsigned long) &_text); 2.14 - if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG)) 2.15 - printk("Unable to unmap first page\n"); 2.16 - 2.17 - if (free_physical_pages(mfns, n) != n) 2.18 - printk("Unable to free bootstrap pages\n"); 2.19 + mfn_zero = virt_to_mfn((unsigned long) &_text); 2.20 + if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) 2.21 + printk("Unable to unmap NULL page\n"); 2.22 } 2.23 2.24 void arch_init_p2m(unsigned long max_pfn)
3.1 --- a/extras/mini-os/blkfront.c Thu Jun 19 12:48:04 2008 +0900 3.2 +++ b/extras/mini-os/blkfront.c Wed Jul 02 11:30:37 2008 +0900 3.3 @@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char 3.4 3.5 dev->events = NULL; 3.6 3.7 - // FIXME: proper frees on failures 3.8 again: 3.9 err = xenbus_transaction_start(&xbt); 3.10 if (err) {
4.1 --- a/extras/mini-os/fbfront.c Thu Jun 19 12:48:04 2008 +0900 4.2 +++ b/extras/mini-os/fbfront.c Wed Jul 02 11:30:37 2008 +0900 4.3 @@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char 4.4 s->in_cons = s->in_prod = 0; 4.5 s->out_cons = s->out_prod = 0; 4.6 4.7 - // FIXME: proper frees on failures 4.8 again: 4.9 err = xenbus_transaction_start(&xbt); 4.10 if (err) { 4.11 @@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n 4.12 s->pd[i] = 0; 4.13 4.14 4.15 - // FIXME: proper frees on failures 4.16 again: 4.17 err = xenbus_transaction_start(&xbt); 4.18 if (err) {
5.1 --- a/extras/mini-os/fs-front.c Thu Jun 19 12:48:04 2008 +0900 5.2 +++ b/extras/mini-os/fs-front.c Wed Jul 02 11:30:37 2008 +0900 5.3 @@ -136,8 +136,8 @@ static inline void add_id_to_freelist(un 5.4 again: 5.5 old_id = freelist[0]; 5.6 /* Note: temporal inconsistency, since freelist[0] can be changed by someone 5.7 - * else, but we are a sole owner of freelist[id], it's OK. */ 5.8 - freelist[id] = old_id; 5.9 + * else, but we are a sole owner of freelist[id + 1], it's OK. */ 5.10 + freelist[id + 1] = old_id; 5.11 new_id = id; 5.12 if(cmpxchg(&freelist[0], old_id, new_id) != old_id) 5.13 { 5.14 @@ -154,7 +154,7 @@ static inline unsigned short get_id_from 5.15 5.16 again: 5.17 old_id = freelist[0]; 5.18 - new_id = freelist[old_id]; 5.19 + new_id = freelist[old_id + 1]; 5.20 if(cmpxchg(&freelist[0], old_id, new_id) != old_id) 5.21 { 5.22 printk("Cmpxchg on freelist remove failed.\n"); 5.23 @@ -785,8 +785,8 @@ static void alloc_request_table(struct f 5.24 printk("Allocating request array for import %d, nr_entries = %d.\n", 5.25 import->import_id, import->nr_entries); 5.26 requests = xmalloc_array(struct fs_request, import->nr_entries); 5.27 - import->freelist = xmalloc_array(unsigned short, import->nr_entries); 5.28 - memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries); 5.29 + import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1); 5.30 + memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1)); 5.31 for(i=0; i<import->nr_entries; i++) 5.32 { 5.33 /* TODO: that's a lot of memory */
6.1 --- a/extras/mini-os/lib/sys.c Thu Jun 19 12:48:04 2008 +0900 6.2 +++ b/extras/mini-os/lib/sys.c Wed Jul 02 11:30:37 2008 +0900 6.3 @@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set 6.4 #ifdef LIBC_VERBOSE 6.5 static int nb; 6.6 static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE]; 6.7 - static s64_t lastshown; 6.8 + static s_time_t lastshown; 6.9 6.10 nb++; 6.11 #endif
7.1 --- a/extras/mini-os/netfront.c Thu Jun 19 12:48:04 2008 +0900 7.2 +++ b/extras/mini-os/netfront.c Wed Jul 02 11:30:37 2008 +0900 7.3 @@ -38,7 +38,7 @@ struct net_buffer { 7.4 struct netfront_dev { 7.5 domid_t dom; 7.6 7.7 - unsigned short tx_freelist[NET_TX_RING_SIZE]; 7.8 + unsigned short tx_freelist[NET_TX_RING_SIZE + 1]; 7.9 struct semaphore tx_sem; 7.10 7.11 struct net_buffer rx_buffers[NET_RX_RING_SIZE]; 7.12 @@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev 7.13 7.14 static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist) 7.15 { 7.16 - freelist[id] = freelist[0]; 7.17 + freelist[id + 1] = freelist[0]; 7.18 freelist[0] = id; 7.19 } 7.20 7.21 static inline unsigned short get_id_from_freelist(unsigned short* freelist) 7.22 { 7.23 unsigned int id = freelist[0]; 7.24 - freelist[0] = freelist[id]; 7.25 + freelist[0] = freelist[id + 1]; 7.26 return id; 7.27 } 7.28
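The freelist fix above (the same change is applied to fs-front.c) shifts every id slot up by one so that id 0 no longer aliases the list head kept in freelist[0]; that is also why the array grows to NET_TX_RING_SIZE + 1. A minimal standalone sketch of the corrected scheme, with a tiny ring size chosen only for illustration:

```c
#include <stdio.h>

#define RING_SIZE 4  /* tiny ring for illustration; netfront uses NET_TX_RING_SIZE */

/* freelist[0] holds the head id; the slot for id lives at freelist[id + 1],
 * so the array needs RING_SIZE + 1 entries and id 0 cannot clobber the head. */
static unsigned short tx_freelist[RING_SIZE + 1];

static void add_id_to_freelist(unsigned int id, unsigned short *freelist)
{
    freelist[id + 1] = freelist[0];
    freelist[0] = id;
}

static unsigned short get_id_from_freelist(unsigned short *freelist)
{
    unsigned int id = freelist[0];
    freelist[0] = freelist[id + 1];
    return id;
}

int main(void)
{
    unsigned int id;

    for (id = 0; id < RING_SIZE; id++)
        add_id_to_freelist(id, tx_freelist);

    /* ids come back newest-first: 3 2 1 0 */
    for (id = 0; id < RING_SIZE; id++)
        printf("%u ", (unsigned)get_id_from_freelist(tx_freelist));
    printf("\n");
    return 0;
}
```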
8.1 --- a/stubdom/grub.patches/99minios Thu Jun 19 12:48:04 2008 +0900 8.2 +++ b/stubdom/grub.patches/99minios Wed Jul 02 11:30:37 2008 +0900 8.3 @@ -832,7 +832,18 @@ Index: grub/stage2/fsys_iso9660.c 8.4 Index: grub/stage2/fsys_reiserfs.c 8.5 =================================================================== 8.6 --- grub.orig/stage2/fsys_reiserfs.c 2008-06-16 15:18:03.410933000 +0100 8.7 -+++ grub/stage2/fsys_reiserfs.c 2008-06-16 15:18:14.786009000 +0100 8.8 ++++ grub/stage2/fsys_reiserfs.c 2008-06-20 18:33:52.002100000 +0100 8.9 +@@ -224,8 +224,8 @@ 8.10 + 8.11 + struct disk_child 8.12 + { 8.13 +- unsigned long dc_block_number; /* Disk child's block number. */ 8.14 +- unsigned short dc_size; /* Disk child's used space. */ 8.15 ++ __u32 dc_block_number; /* Disk child's block number. */ 8.16 ++ __u16 dc_size; /* Disk child's used space. */ 8.17 + }; 8.18 + 8.19 + #define DC_SIZE (sizeof (struct disk_child)) 8.20 @@ -369,7 +369,14 @@ 8.21 static __inline__ unsigned long 8.22 log2 (unsigned long word)
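The disk_child hunk swaps ABI-dependent types for fixed-width ones, presumably because `unsigned long` is 8 bytes in a 64-bit build (as pv-grub in a stub domain can be), which would change DC_SIZE and every on-disk offset derived from it. A hedged sketch of the idea, using standard uint32_t/uint16_t in place of the kernel-style __u32/__u16 and a compile-time size check that is not part of the patch:

```c
#include <stdint.h>

/* On-disk metadata must not change size with the host ABI: with
 * "unsigned long", dc_block_number becomes 8 bytes on a 64-bit build,
 * shifting DC_SIZE and every offset computed from it. */
struct disk_child {
    uint32_t dc_block_number;   /* __u32 in the patch */
    uint16_t dc_size;           /* __u16 in the patch */
};

/* Poor man's static assertion (illustrative only): the layout stays
 * 8 bytes on both 32- and 64-bit builds. */
typedef char dc_size_is_stable[(sizeof(struct disk_child) == 8) ? 1 : -1];
```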
9.1 --- a/stubdom/grub/Makefile Thu Jun 19 12:48:04 2008 +0900 9.2 +++ b/stubdom/grub/Makefile Wed Jul 02 11:30:37 2008 +0900 9.3 @@ -5,7 +5,7 @@ vpath %.c ../grub-cvs 9.4 9.5 BOOT=boot-$(XEN_TARGET_ARCH).o 9.6 9.7 -DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I. 9.8 +DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I. 9.9 DEF_CPPFLAGS += -I../grub-cvs/stage1 9.10 DEF_CPPFLAGS += -I../grub-cvs/stage2 9.11 DEF_CPPFLAGS += -I../grub-cvs/netboot
10.1 --- a/tools/blktap/drivers/Makefile Thu Jun 19 12:48:04 2008 +0900 10.2 +++ b/tools/blktap/drivers/Makefile Wed Jul 02 11:30:37 2008 +0900 10.3 @@ -17,8 +17,16 @@ CFLAGS += -D_GNU_SOURCE 10.4 CFLAGS += -Wp,-MD,.$(@F).d 10.5 DEPS = .*.d 10.6 10.7 +ifeq ($(shell . ./check_gcrypt),"yes") 10.8 +CFLAGS += -DUSE_GCRYPT 10.9 +CRYPT_LIB := -lgcrypt 10.10 +else 10.11 +CRYPT_LIB := -lcrypto 10.12 +$(warning *** libgcrypt not installed: falling back to libcrypto ***) 10.13 +endif 10.14 + 10.15 LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap 10.16 -LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz 10.17 +LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz 10.18 10.19 BLK-OBJS-y := block-aio.o 10.20 BLK-OBJS-y += block-sync.o
11.1 --- a/tools/blktap/drivers/blktapctrl.c Thu Jun 19 12:48:04 2008 +0900 11.2 +++ b/tools/blktap/drivers/blktapctrl.c Wed Jul 02 11:30:37 2008 +0900 11.3 @@ -127,7 +127,7 @@ static int get_new_dev(int *major, int * 11.4 char *devname; 11.5 11.6 tr.domid = blkif->domid; 11.7 - tr.busid = (unsigned short)blkif->be_id; 11.8 + tr.busid = blkif->be_id; 11.9 ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr ); 11.10 11.11 if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
12.1 --- a/tools/blktap/drivers/block-qcow.c Thu Jun 19 12:48:04 2008 +0900 12.2 +++ b/tools/blktap/drivers/block-qcow.c Wed Jul 02 11:30:37 2008 +0900 12.3 @@ -33,7 +33,6 @@ 12.4 #include <zlib.h> 12.5 #include <inttypes.h> 12.6 #include <libaio.h> 12.7 -#include <openssl/md5.h> 12.8 #include "bswap.h" 12.9 #include "aes.h" 12.10 #include "tapdisk.h" 12.11 @@ -146,6 +145,35 @@ struct tdqcow_state { 12.12 12.13 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset); 12.14 12.15 +#ifdef USE_GCRYPT 12.16 + 12.17 +#include <gcrypt.h> 12.18 + 12.19 +static uint32_t gen_cksum(char *ptr, int len) 12.20 +{ 12.21 + int i; 12.22 + uint32_t md[4]; 12.23 + 12.24 + /* Convert L1 table to big endian */ 12.25 + for(i = 0; i < len / sizeof(uint64_t); i++) { 12.26 + cpu_to_be64s(&((uint64_t*) ptr)[i]); 12.27 + } 12.28 + 12.29 + /* Generate checksum */ 12.30 + gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len); 12.31 + 12.32 + /* Convert L1 table back to native endianess */ 12.33 + for(i = 0; i < len / sizeof(uint64_t); i++) { 12.34 + be64_to_cpus(&((uint64_t*) ptr)[i]); 12.35 + } 12.36 + 12.37 + return md[0]; 12.38 +} 12.39 + 12.40 +#else /* use libcrypto */ 12.41 + 12.42 +#include <openssl/md5.h> 12.43 + 12.44 static uint32_t gen_cksum(char *ptr, int len) 12.45 { 12.46 int i; 12.47 @@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int 12.48 uint32_t ret; 12.49 12.50 md = malloc(MD5_DIGEST_LENGTH); 12.51 + if(!md) return 0; 12.52 12.53 - if(!md) return 0; 12.54 - 12.55 /* Convert L1 table to big endian */ 12.56 for(i = 0; i < len / sizeof(uint64_t); i++) { 12.57 cpu_to_be64s(&((uint64_t*) ptr)[i]); 12.58 @@ -176,6 +203,8 @@ static uint32_t gen_cksum(char *ptr, int 12.59 return ret; 12.60 } 12.61 12.62 +#endif 12.63 + 12.64 static int get_filesize(char *filename, uint64_t *size, struct stat *st) 12.65 { 12.66 int fd;
13.1 --- a/tools/blktap/drivers/block-qcow2.c Thu Jun 19 12:48:04 2008 +0900 13.2 +++ b/tools/blktap/drivers/block-qcow2.c Wed Jul 02 11:30:37 2008 +0900 13.3 @@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of 13.4 */ 13.5 static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count) 13.6 { 13.7 - int ret; 13.8 - 13.9 - ret = lseek(fd, offset, SEEK_SET); 13.10 - if (ret != offset) { 13.11 + if (lseek(fd, offset, SEEK_SET) == -1) { 13.12 DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset); 13.13 return -1; 13.14 }
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/tools/blktap/drivers/check_gcrypt Wed Jul 02 11:30:37 2008 +0900 14.3 @@ -0,0 +1,14 @@ 14.4 +#!/bin/sh 14.5 + 14.6 +cat > .gcrypt.c << EOF 14.7 +#include <gcrypt.h> 14.8 +int main(void) { return 0; } 14.9 +EOF 14.10 + 14.11 +if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then 14.12 + echo "yes" 14.13 +else 14.14 + echo "no" 14.15 +fi 14.16 + 14.17 +rm -f .gcrypt*
15.1 --- a/tools/blktap/lib/blktaplib.h Thu Jun 19 12:48:04 2008 +0900 15.2 +++ b/tools/blktap/lib/blktaplib.h Wed Jul 02 11:30:37 2008 +0900 15.3 @@ -161,7 +161,7 @@ typedef struct tapdev_info { 15.4 15.5 typedef struct domid_translate { 15.6 unsigned short domid; 15.7 - unsigned short busid; 15.8 + uint32_t busid; 15.9 } domid_translate_t ; 15.10 15.11 typedef struct image {
16.1 --- a/tools/debugger/xenitp/xenitp.c Thu Jun 19 12:48:04 2008 +0900 16.2 +++ b/tools/debugger/xenitp/xenitp.c Wed Jul 02 11:30:37 2008 +0900 16.3 @@ -58,6 +58,16 @@ static int cur_vcpu; 16.4 16.5 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr); 16.6 16.7 +/* wrapper for vcpu_gest_context_any_t */ 16.8 +static int xc_ia64_vcpu_getcontext(int xc_handle, 16.9 + uint32_t domid, 16.10 + uint32_t vcpu, 16.11 + vcpu_guest_context_t *ctxt) 16.12 +{ 16.13 + return xc_vcpu_getcontext(xc_handle, domid, vcpu, 16.14 + (vcpu_guest_context_any_t *)ctxt); 16.15 +} 16.16 + 16.17 static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx) 16.18 { 16.19 return (ctx->regs.psr >> PSR_RI_SHIFT) & 3; 16.20 @@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co 16.21 fflush (stdout); 16.22 nanosleep (&ts, NULL); 16.23 } 16.24 - return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx); 16.25 + return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx); 16.26 } 16.27 16.28 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr) 16.29 @@ -945,13 +955,13 @@ char *parse_arg (char **buf) 16.30 return res; 16.31 } 16.32 16.33 -vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS]; 16.34 +vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS]; 16.35 16.36 int vcpu_setcontext (int vcpu) 16.37 { 16.38 int ret; 16.39 16.40 - ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]); 16.41 + ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]); 16.42 if (ret < 0) 16.43 perror ("xc_vcpu_setcontext"); 16.44 16.45 @@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch 16.46 int flag_ambiguous; 16.47 16.48 cur_vcpu = vcpu; 16.49 - cur_ctx = &vcpu_ctx[vcpu]; 16.50 + cur_ctx = &vcpu_ctx_any[vcpu].c; 16.51 16.52 /* Handle repeat last-command. */ 16.53 if (*line == 0) { 16.54 @@ -1575,7 +1585,7 @@ void xenitp (int vcpu) 16.55 int ret; 16.56 struct sigaction sa; 16.57 16.58 - cur_ctx = &vcpu_ctx[vcpu]; 16.59 + cur_ctx = &vcpu_ctx_any[vcpu].c; 16.60 16.61 xc_handle = xc_interface_open (); /* for accessing control interface */ 16.62 16.63 @@ -1588,9 +1598,9 @@ void xenitp (int vcpu) 16.64 exit (-1); 16.65 } 16.66 16.67 - ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx); 16.68 + ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx); 16.69 if (ret < 0) { 16.70 - perror ("xc_vcpu_getcontext"); 16.71 + perror ("xc_ia64_vcpu_getcontext"); 16.72 exit (-1); 16.73 } 16.74
17.1 --- a/tools/examples/xend-config.sxp Thu Jun 19 12:48:04 2008 +0900 17.2 +++ b/tools/examples/xend-config.sxp Wed Jul 02 11:30:37 2008 +0900 17.3 @@ -242,3 +242,6 @@ 17.4 17.5 # Script to run when the label of a resource has changed. 17.6 #(resource-label-change-script '') 17.7 + 17.8 +# Rotation count of qemu-dm log file. 17.9 +#(qemu-dm-logrotate-count 10)
18.1 --- a/tools/firmware/hvmloader/hvmloader.c Thu Jun 19 12:48:04 2008 +0900 18.2 +++ b/tools/firmware/hvmloader/hvmloader.c Wed Jul 02 11:30:37 2008 +0900 18.3 @@ -206,10 +206,12 @@ static void pci_setup(void) 18.4 pci_writew(devfn, 0x3d, 0x0001); 18.5 break; 18.6 case 0x0101: 18.7 - /* PIIX3 IDE */ 18.8 - ASSERT((vendor_id == 0x8086) && (device_id == 0x7010)); 18.9 - pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */ 18.10 - pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */ 18.11 + if ( vendor_id == 0x8086 ) 18.12 + { 18.13 + /* Intel ICHs since PIIX3: enable IDE legacy mode. */ 18.14 + pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */ 18.15 + pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */ 18.16 + } 18.17 break; 18.18 } 18.19
19.1 --- a/tools/firmware/rombios/rombios.c Thu Jun 19 12:48:04 2008 +0900 19.2 +++ b/tools/firmware/rombios/rombios.c Wed Jul 02 11:30:37 2008 +0900 19.3 @@ -9783,6 +9783,27 @@ smbios_init: 19.4 19.5 #endif 19.6 19.7 +#if BX_TCGBIOS 19.8 +; The section between the POST entry and the NMI entry is filling up 19.9 +; and causes crashes if this code was directly there 19.10 +tcpa_post_part1: 19.11 + call _tcpa_acpi_init 19.12 + 19.13 + push dword #0 19.14 + call _tcpa_initialize_tpm 19.15 + add sp, #4 19.16 + 19.17 + call _tcpa_do_measure_POSTs 19.18 + call _tcpa_wake_event /* specs: 3.2.3.7 */ 19.19 + ret 19.20 + 19.21 +tcpa_post_part2: 19.22 + call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */ 19.23 + call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */ 19.24 + /* we do not call int 19h handler but keep following eventlog */ 19.25 + call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */ 19.26 + ret 19.27 +#endif 19.28 19.29 19.30 ;; for 'C' strings and other data, insert them here with 19.31 @@ -10003,14 +10024,7 @@ post_default_ints: 19.32 mov 0x0410, ax 19.33 19.34 #if BX_TCGBIOS 19.35 - call _tcpa_acpi_init 19.36 - 19.37 - push dword #0 19.38 - call _tcpa_initialize_tpm 19.39 - add sp, #4 19.40 - 19.41 - call _tcpa_do_measure_POSTs 19.42 - call _tcpa_wake_event /* specs: 3.2.3.7 */ 19.43 + call tcpa_post_part1 19.44 #endif 19.45 19.46 ;; Parallel setup 19.47 @@ -10138,10 +10152,7 @@ post_default_ints: 19.48 call _interactive_bootkey 19.49 19.50 #if BX_TCGBIOS 19.51 - call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */ 19.52 - call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */ 19.53 - /* we do not call int 19h handler but keep following eventlog */ 19.54 - call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */ 19.55 + call tcpa_post_part2 19.56 #endif 19.57 19.58 ;; Start the boot sequence. See the comments in int19_relocated
20.1 --- a/tools/ioemu/hw/xen_console.c Thu Jun 19 12:48:04 2008 +0900 20.2 +++ b/tools/ioemu/hw/xen_console.c Wed Jul 02 11:30:37 2008 +0900 20.3 @@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons 20.4 20.5 static int domain_create_ring(struct domain *dom) 20.6 { 20.7 - int err, remote_port, ring_ref, rc; 20.8 + int err, remote_port, ring_ref, limit, rc; 20.9 20.10 err = xs_gather(dom->xsh, dom->serialpath, 20.11 "ring-ref", "%u", &ring_ref, 20.12 "port", "%i", &remote_port, 20.13 + "limit", "%i", &limit, 20.14 NULL); 20.15 if (err) { 20.16 err = xs_gather(dom->xsh, dom->conspath, 20.17 "ring-ref", "%u", &ring_ref, 20.18 "port", "%i", &remote_port, 20.19 + "limit", "%i", &limit, 20.20 NULL); 20.21 if (err) { 20.22 fprintf(stderr, "Console: failed to find ring-ref/port yet\n"); 20.23 @@ -178,7 +180,9 @@ static int domain_create_ring(struct dom 20.24 dom->use_consolepath = 1; 20.25 } else 20.26 dom->use_consolepath = 0; 20.27 - fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref, remote_port); 20.28 + dom->buffer.max_capacity = limit; 20.29 + fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n", 20.30 + ring_ref, remote_port, limit); 20.31 20.32 if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port)) 20.33 goto out;
21.1 --- a/tools/ioemu/target-i386-dm/exec-dm.c Thu Jun 19 12:48:04 2008 +0900 21.2 +++ b/tools/ioemu/target-i386-dm/exec-dm.c Wed Jul 02 11:30:37 2008 +0900 21.3 @@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void 21.4 } 21.5 #endif 21.6 21.7 -void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, 21.8 - int len, int is_write) 21.9 +void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf, 21.10 + int _len, int is_write) 21.11 { 21.12 + target_phys_addr_t addr = _addr; 21.13 + int len = _len; 21.14 int l, io_index; 21.15 uint8_t *ptr; 21.16 uint32_t val; 21.17 @@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_ 21.18 } else if ((ptr = phys_ram_addr(addr)) != NULL) { 21.19 /* Writing to RAM */ 21.20 memcpy_words(ptr, buf, l); 21.21 +#ifndef CONFIG_STUBDOM 21.22 if (logdirty_bitmap != NULL) { 21.23 /* Record that we have dirtied this frame */ 21.24 unsigned long pfn = addr >> TARGET_PAGE_BITS; 21.25 @@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_ 21.26 |= 1UL << pfn % HOST_LONG_BITS; 21.27 } 21.28 } 21.29 +#endif 21.30 #ifdef __ia64__ 21.31 sync_icache(ptr, l); 21.32 #endif 21.33 @@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_ 21.34 addr += l; 21.35 } 21.36 21.37 +#ifdef CONFIG_STUBDOM 21.38 + if (logdirty_bitmap != NULL) 21.39 + xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS, 21.40 + (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS 21.41 + - _addr >> TARGET_PAGE_BITS); 21.42 +#endif 21.43 + 21.44 mapcache_unlock(); 21.45 } 21.46 #endif
22.1 --- a/tools/ioemu/xenstore.c Thu Jun 19 12:48:04 2008 +0900 22.2 +++ b/tools/ioemu/xenstore.c Wed Jul 02 11:30:37 2008 +0900 22.3 @@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv 22.4 /* autoguess qcow vs qcow2 */ 22.5 } else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) { 22.6 format = &bdrv_raw; 22.7 - } else if (!strcmp(drv,"phy")) { 22.8 - format = &bdrv_raw; 22.9 } else { 22.10 format = bdrv_find_format(drv); 22.11 if (!format) { 22.12 @@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi 22.13 /* No key yet: wait for the next watch */ 22.14 return; 22.15 22.16 +#ifdef CONFIG_STUBDOM 22.17 + /* We pass the writes to hypervisor */ 22.18 + seg = (void*)1; 22.19 +#else 22.20 strncpy(key_terminated, key_ascii, 16); 22.21 free(key_ascii); 22.22 key = (key_t) strtoull(key_terminated, NULL, 16); 22.23 @@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi 22.24 fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__, 22.25 (unsigned long long)key, logdirty_bitmap_size); 22.26 22.27 -#ifdef CONFIG_STUBDOM 22.28 - /* XXX we just can't use shm. */ 22.29 - fprintf(logfile, "Log dirty is not implemented in stub domains!\n"); 22.30 - return; 22.31 -#else 22.32 shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR); 22.33 if (shmid == -1) { 22.34 fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx "
23.1 --- a/tools/libxc/ia64/xc_ia64_hvm_build.c Thu Jun 19 12:48:04 2008 +0900 23.2 +++ b/tools/libxc/ia64/xc_ia64_hvm_build.c Wed Jul 02 11:30:37 2008 +0900 23.3 @@ -1052,7 +1052,8 @@ error_out: 23.4 int 23.5 xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name) 23.6 { 23.7 - vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt; 23.8 + vcpu_guest_context_any_t st_ctxt_any; 23.9 + vcpu_guest_context_t *ctxt = &st_ctxt_any.c; 23.10 char *image = NULL; 23.11 unsigned long image_size; 23.12 unsigned long nr_pages; 23.13 @@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom 23.14 23.15 free(image); 23.16 23.17 - memset(ctxt, 0, sizeof(*ctxt)); 23.18 + memset(&st_ctxt_any, 0, sizeof(st_ctxt_any)); 23.19 ctxt->regs.ip = 0x80000000ffffffb0UL; 23.20 ctxt->regs.ar.fpsr = xc_ia64_fpsr_default(); 23.21 ctxt->regs.cr.itir = 14 << 2; 23.22 ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN; 23.23 ctxt->regs.cr.dcr = 0; 23.24 ctxt->regs.cr.pta = 15 << 2; 23.25 - return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt); 23.26 + return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any); 23.27 23.28 error_out: 23.29 free(image);
24.1 --- a/tools/libxc/ia64/xc_ia64_linux_restore.c Thu Jun 19 12:48:04 2008 +0900 24.2 +++ b/tools/libxc/ia64/xc_ia64_linux_restore.c Wed Jul 02 11:30:37 2008 +0900 24.3 @@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han 24.4 24.5 static int 24.6 xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom, 24.7 - uint32_t vcpu, vcpu_guest_context_t *ctxt) 24.8 + uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any) 24.9 { 24.10 + vcpu_guest_context_t *ctxt = &ctxt_any->c; 24.11 if (read_exact(io_fd, ctxt, sizeof(*ctxt))) { 24.12 ERROR("Error when reading ctxt"); 24.13 return -1; 24.14 @@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle, 24.15 24.16 /* Initialize and set registers. */ 24.17 ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online; 24.18 - if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) { 24.19 + if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) { 24.20 ERROR("Couldn't set vcpu context"); 24.21 return -1; 24.22 } 24.23 24.24 /* Just a check. */ 24.25 ctxt->flags = 0; 24.26 - if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) { 24.27 + if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) { 24.28 ERROR("Could not get vcpu context"); 24.29 return -1; 24.30 } 24.31 @@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand 24.32 int rc = -1; 24.33 24.34 /* A copy of the CPU context of the guest. */ 24.35 - vcpu_guest_context_t ctxt; 24.36 - 24.37 - if (lock_pages(&ctxt, sizeof(ctxt))) { 24.38 + vcpu_guest_context_any_t ctxt_any; 24.39 + vcpu_guest_context_t *ctxt = &ctxt_any.c; 24.40 + 24.41 + if (lock_pages(&ctxt_any, sizeof(ctxt_any))) { 24.42 /* needed for build domctl, but might as well do early */ 24.43 ERROR("Unable to lock_pages ctxt"); 24.44 return -1; 24.45 } 24.46 24.47 - if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt)) 24.48 + if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any)) 24.49 goto out; 24.50 24.51 /* Then get privreg page. */ 24.52 - if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) { 24.53 + if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) { 24.54 ERROR("Could not read vcpu privregs"); 24.55 goto out; 24.56 } 24.57 @@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle, 24.58 /* vcpu context */ 24.59 for (i = 0; i <= info.max_vcpu_id; i++) { 24.60 /* A copy of the CPU context of the guest. */ 24.61 - vcpu_guest_context_t ctxt; 24.62 + vcpu_guest_context_any_t ctxt_any; 24.63 24.64 if (!__test_bit(i, vcpumap)) 24.65 continue; 24.66 24.67 - if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) 24.68 + if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) 24.69 goto out; 24.70 24.71 /* system context of vcpu is recieved as hvm context. */
25.1 --- a/tools/libxc/ia64/xc_ia64_linux_save.c Thu Jun 19 12:48:04 2008 +0900 25.2 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c Wed Jul 02 11:30:37 2008 +0900 25.3 @@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han 25.4 25.5 static int 25.6 xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom, 25.7 - uint32_t vcpu, vcpu_guest_context_t *ctxt) 25.8 + uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any) 25.9 { 25.10 - if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) { 25.11 + vcpu_guest_context_t *ctxt = &ctxt_any->c; 25.12 + if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) { 25.13 ERROR("Could not get vcpu context"); 25.14 return -1; 25.15 } 25.16 @@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i 25.17 /* vcpu context */ 25.18 for (i = 0; i <= info->max_vcpu_id; i++) { 25.19 /* A copy of the CPU context of the guest. */ 25.20 - vcpu_guest_context_t ctxt; 25.21 + vcpu_guest_context_any_t ctxt_any; 25.22 + vcpu_guest_context_t *ctxt = &ctxt_any.c; 25.23 + 25.24 char *mem; 25.25 25.26 if (!__test_bit(i, vcpumap)) 25.27 continue; 25.28 25.29 - if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) 25.30 + if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) 25.31 goto out; 25.32 25.33 mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 25.34 - PROT_READ|PROT_WRITE, ctxt.privregs_pfn); 25.35 + PROT_READ|PROT_WRITE, ctxt->privregs_pfn); 25.36 if (mem == NULL) { 25.37 ERROR("cannot map privreg page"); 25.38 goto out; 25.39 @@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle, 25.40 /* vcpu context */ 25.41 for (i = 0; i <= info->max_vcpu_id; i++) { 25.42 /* A copy of the CPU context of the guest. */ 25.43 - vcpu_guest_context_t ctxt; 25.44 + vcpu_guest_context_any_t ctxt_any; 25.45 25.46 if (!__test_bit(i, vcpumap)) 25.47 continue; 25.48 25.49 - if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) 25.50 + if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) 25.51 goto out; 25.52 25.53 /* system context of vcpu is sent as hvm context. */
26.1 --- a/tools/libxc/xc_core.c Thu Jun 19 12:48:04 2008 +0900 26.2 +++ b/tools/libxc/xc_core.c Wed Jul 02 11:30:37 2008 +0900 26.3 @@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h 26.4 26.5 int nr_vcpus = 0; 26.6 char *dump_mem, *dump_mem_start = NULL; 26.7 - vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; 26.8 + vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS]; 26.9 struct xc_core_arch_context arch_ctxt; 26.10 char dummy[PAGE_SIZE]; 26.11 int dummy_len; 26.12 @@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h 26.13 PERROR("Could not get section header for .xen_prstatus"); 26.14 goto out; 26.15 } 26.16 - filesz = sizeof(ctxt[0]) * nr_vcpus; 26.17 + filesz = sizeof(ctxt[0].c) * nr_vcpus; 26.18 sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS, 26.19 SHT_PROGBITS, offset, filesz, 26.20 - __alignof__(ctxt[0]), sizeof(ctxt[0])); 26.21 + __alignof__(ctxt[0].c), sizeof(ctxt[0].c)); 26.22 if ( sts != 0 ) 26.23 goto out; 26.24 offset += filesz; 26.25 @@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h 26.26 goto out; 26.27 26.28 /* prstatus: .xen_prstatus */ 26.29 - sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus); 26.30 + sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus); 26.31 if ( sts != 0 ) 26.32 goto out; 26.33
27.1 --- a/tools/libxc/xc_core_ia64.c Thu Jun 19 12:48:04 2008 +0900 27.2 +++ b/tools/libxc/xc_core_ia64.c Wed Jul 02 11:30:37 2008 +0900 27.3 @@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core 27.4 27.5 int 27.6 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt, 27.7 - vcpu_guest_context_t* ctxt, 27.8 + vcpu_guest_context_any_t* ctxt_any, 27.9 int xc_handle, uint32_t domid) 27.10 { 27.11 + vcpu_guest_context_t *ctxt = &ctxt_any->c; 27.12 mapped_regs_t* mapped_regs; 27.13 27.14 if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM )
28.1 --- a/tools/libxc/xc_core_ia64.h Thu Jun 19 12:48:04 2008 +0900 28.2 +++ b/tools/libxc/xc_core_ia64.h Wed Jul 02 11:30:37 2008 +0900 28.3 @@ -40,7 +40,7 @@ void 28.4 xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt); 28.5 int 28.6 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt, 28.7 - vcpu_guest_context_t* ctxt, 28.8 + vcpu_guest_context_any_t* ctxt, 28.9 int xc_handle, uint32_t domid); 28.10 int 28.11 xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt,
29.1 --- a/tools/libxc/xc_domain.c Thu Jun 19 12:48:04 2008 +0900 29.2 +++ b/tools/libxc/xc_domain.c Wed Jul 02 11:30:37 2008 +0900 29.3 @@ -298,30 +298,21 @@ int xc_domain_hvm_setcontext(int xc_hand 29.4 int xc_vcpu_getcontext(int xc_handle, 29.5 uint32_t domid, 29.6 uint32_t vcpu, 29.7 - vcpu_guest_context_t *ctxt) 29.8 + vcpu_guest_context_any_t *ctxt) 29.9 { 29.10 int rc; 29.11 DECLARE_DOMCTL; 29.12 - size_t sz = sizeof(vcpu_guest_context_either_t); 29.13 + size_t sz = sizeof(vcpu_guest_context_any_t); 29.14 29.15 domctl.cmd = XEN_DOMCTL_getvcpucontext; 29.16 domctl.domain = (domid_t)domid; 29.17 domctl.u.vcpucontext.vcpu = (uint16_t)vcpu; 29.18 - set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt); 29.19 + set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c); 29.20 29.21 - /* 29.22 - * We may be asked to lock either a 32-bit or a 64-bit context. Lock the 29.23 - * larger of the two if possible, otherwise fall back to native size. 29.24 - */ 29.25 + 29.26 if ( (rc = lock_pages(ctxt, sz)) != 0 ) 29.27 - { 29.28 - sz = sizeof(*ctxt); 29.29 - if ( (rc = lock_pages(ctxt, sz)) != 0 ) 29.30 - return rc; 29.31 - } 29.32 - 29.33 + return rc; 29.34 rc = do_domctl(xc_handle, &domctl); 29.35 - 29.36 unlock_pages(ctxt, sz); 29.37 29.38 return rc; 29.39 @@ -626,32 +617,28 @@ int xc_availheap(int xc_handle, 29.40 int xc_vcpu_setcontext(int xc_handle, 29.41 uint32_t domid, 29.42 uint32_t vcpu, 29.43 - vcpu_guest_context_t *ctxt) 29.44 + vcpu_guest_context_any_t *ctxt) 29.45 { 29.46 DECLARE_DOMCTL; 29.47 int rc; 29.48 - size_t sz = sizeof(vcpu_guest_context_either_t); 29.49 + size_t sz = sizeof(vcpu_guest_context_any_t); 29.50 + 29.51 + if (ctxt == NULL) 29.52 + { 29.53 + errno = EINVAL; 29.54 + return -1; 29.55 + } 29.56 29.57 domctl.cmd = XEN_DOMCTL_setvcpucontext; 29.58 domctl.domain = domid; 29.59 domctl.u.vcpucontext.vcpu = vcpu; 29.60 - set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt); 29.61 + set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c); 29.62 29.63 - /* 29.64 - * We may be asked to lock either a 32-bit or a 64-bit context. Lock the 29.65 - * larger of the two if possible, otherwise fall back to native size. 29.66 - */ 29.67 - if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 ) 29.68 - { 29.69 - sz = sizeof(*ctxt); 29.70 - if ( (rc = lock_pages(ctxt, sz)) != 0 ) 29.71 - return rc; 29.72 - } 29.73 - 29.74 + if ( (rc = lock_pages(ctxt, sz)) != 0 ) 29.75 + return rc; 29.76 rc = do_domctl(xc_handle, &domctl); 29.77 - 29.78 - if ( ctxt != NULL ) 29.79 - unlock_pages(ctxt, sz); 29.80 + 29.81 + unlock_pages(ctxt, sz); 29.82 29.83 return rc; 29.84 }
30.1 --- a/tools/libxc/xc_domain_restore.c Thu Jun 19 12:48:04 2008 +0900 30.2 +++ b/tools/libxc/xc_domain_restore.c Wed Jul 02 11:30:37 2008 +0900 30.3 @@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list( 30.4 int io_fd, int *pae_extended_cr3, int *ext_vcpucontext) 30.5 { 30.6 xen_pfn_t *p2m_frame_list; 30.7 - vcpu_guest_context_either_t ctxt; 30.8 + vcpu_guest_context_any_t ctxt; 30.9 xen_pfn_t p2m_fl_zero; 30.10 30.11 /* Read first entry of P2M list, or extended-info signature (~0UL). */ 30.12 @@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int 30.13 /* The new domain's shared-info frame number. */ 30.14 unsigned long shared_info_frame; 30.15 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */ 30.16 - shared_info_either_t *old_shared_info = 30.17 - (shared_info_either_t *)shared_info_page; 30.18 - shared_info_either_t *new_shared_info; 30.19 + shared_info_any_t *old_shared_info = 30.20 + (shared_info_any_t *)shared_info_page; 30.21 + shared_info_any_t *new_shared_info; 30.22 30.23 /* A copy of the CPU context of the guest. */ 30.24 - vcpu_guest_context_either_t ctxt; 30.25 + vcpu_guest_context_any_t ctxt; 30.26 30.27 /* A table containing the type of each PFN (/not/ MFN!). */ 30.28 unsigned long *pfn_type = NULL; 30.29 @@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int 30.30 xen_pfn_t *p2m_frame_list = NULL; 30.31 30.32 /* A temporary mapping of the guest's start_info page. */ 30.33 - start_info_either_t *start_info; 30.34 + start_info_any_t *start_info; 30.35 30.36 /* Our mapping of the current region (batch) */ 30.37 char *region_base;
31.1 --- a/tools/libxc/xc_domain_save.c Thu Jun 19 12:48:04 2008 +0900 31.2 +++ b/tools/libxc/xc_domain_save.c Wed Jul 02 11:30:37 2008 +0900 31.3 @@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe 31.4 ** it to update the MFN to a reasonable value. 31.5 */ 31.6 static void *map_frame_list_list(int xc_handle, uint32_t dom, 31.7 - shared_info_either_t *shinfo) 31.8 + shared_info_any_t *shinfo) 31.9 { 31.10 int count = 100; 31.11 void *p; 31.12 @@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table 31.13 int io_fd, 31.14 uint32_t dom, 31.15 unsigned long p2m_size, 31.16 - shared_info_either_t *live_shinfo) 31.17 + shared_info_any_t *live_shinfo) 31.18 { 31.19 - vcpu_guest_context_either_t ctxt; 31.20 + vcpu_guest_context_any_t ctxt; 31.21 31.22 /* Double and single indirect references to the live P2M table */ 31.23 void *live_p2m_frame_list_list = NULL; 31.24 @@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table 31.25 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]); 31.26 } 31.27 31.28 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) 31.29 + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) 31.30 { 31.31 ERROR("Could not get vcpu context"); 31.32 goto out; 31.33 @@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io 31.34 unsigned long shared_info_frame; 31.35 31.36 /* A copy of the CPU context of the guest. */ 31.37 - vcpu_guest_context_either_t ctxt; 31.38 + vcpu_guest_context_any_t ctxt; 31.39 31.40 /* A table containing the type of each PFN (/not/ MFN!). */ 31.41 unsigned long *pfn_type = NULL; 31.42 @@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io 31.43 char page[PAGE_SIZE]; 31.44 31.45 /* Live mapping of shared info structure */ 31.46 - shared_info_either_t *live_shinfo = NULL; 31.47 + shared_info_any_t *live_shinfo = NULL; 31.48 31.49 /* base of the region in which domain memory is mapped */ 31.50 unsigned char *region_base = NULL; 31.51 @@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io 31.52 } 31.53 } 31.54 31.55 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) 31.56 + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) 31.57 { 31.58 ERROR("Could not get vcpu context"); 31.59 goto out; 31.60 @@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io 31.61 if ( !(vcpumap & (1ULL << i)) ) 31.62 continue; 31.63 31.64 - if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) ) 31.65 + if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) 31.66 { 31.67 ERROR("No context for VCPU%d", i); 31.68 goto out; 31.69 @@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io 31.70 * Reset the MFN to be a known-invalid value. See map_frame_list_list(). 31.71 */ 31.72 memcpy(page, live_shinfo, PAGE_SIZE); 31.73 - SET_FIELD(((shared_info_either_t *)page), 31.74 + SET_FIELD(((shared_info_any_t *)page), 31.75 arch.pfn_to_mfn_frame_list_list, 0); 31.76 if ( write_exact(io_fd, page, PAGE_SIZE) ) 31.77 {
32.1 --- a/tools/libxc/xc_misc.c Thu Jun 19 12:48:04 2008 +0900 32.2 +++ b/tools/libxc/xc_misc.c Wed Jul 02 11:30:37 2008 +0900 32.3 @@ -267,6 +267,34 @@ int xc_hvm_track_dirty_vram( 32.4 return rc; 32.5 } 32.6 32.7 +int xc_hvm_modified_memory( 32.8 + int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr) 32.9 +{ 32.10 + DECLARE_HYPERCALL; 32.11 + struct xen_hvm_modified_memory arg; 32.12 + int rc; 32.13 + 32.14 + hypercall.op = __HYPERVISOR_hvm_op; 32.15 + hypercall.arg[0] = HVMOP_modified_memory; 32.16 + hypercall.arg[1] = (unsigned long)&arg; 32.17 + 32.18 + arg.domid = dom; 32.19 + arg.first_pfn = first_pfn; 32.20 + arg.nr = nr; 32.21 + 32.22 + if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 ) 32.23 + { 32.24 + PERROR("Could not lock memory"); 32.25 + return rc; 32.26 + } 32.27 + 32.28 + rc = do_xen_hypercall(xc_handle, &hypercall); 32.29 + 32.30 + unlock_pages(&arg, sizeof(arg)); 32.31 + 32.32 + return rc; 32.33 +} 32.34 + 32.35 void *xc_map_foreign_pages(int xc_handle, uint32_t dom, int prot, 32.36 const xen_pfn_t *arr, int num) 32.37 {
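xc_hvm_modified_memory is the libxc wrapper that the stub-domain device model uses in the exec-dm.c hunk above to keep log-dirty tracking accurate. A minimal sketch of a caller converting a byte range into the pfn range the hypercall expects; report_dirty_range and PAGE_SHIFT_GUEST are hypothetical names, and 4K guest pages are assumed:

```c
#include <stdint.h>
#include <xenctrl.h>

#define PAGE_SHIFT_GUEST 12  /* assumption: 4K guest pages */

/* Hypothetical helper: report that the device model wrote [addr, addr+len)
 * of guest memory, so those pages are marked modified by Xen. */
static int report_dirty_range(int xc_handle, domid_t dom,
                              uint64_t addr, uint64_t len)
{
    uint64_t first_pfn = addr >> PAGE_SHIFT_GUEST;
    uint64_t end_pfn   = (addr + len + (1ULL << PAGE_SHIFT_GUEST) - 1)
                             >> PAGE_SHIFT_GUEST;

    return xc_hvm_modified_memory(xc_handle, dom, first_pfn,
                                  end_pfn - first_pfn);
}
```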
33.1 --- a/tools/libxc/xc_pagetab.c Thu Jun 19 12:48:04 2008 +0900 33.2 +++ b/tools/libxc/xc_pagetab.c Wed Jul 02 11:30:37 2008 +0900 33.3 @@ -48,7 +48,7 @@ 33.4 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom, 33.5 int vcpu, unsigned long long virt ) 33.6 { 33.7 - vcpu_guest_context_t ctx; 33.8 + vcpu_guest_context_any_t ctx; 33.9 unsigned long long cr3; 33.10 void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL; 33.11 unsigned long long pde, pte, pdpe, pmle; 33.12 @@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre 33.13 DPRINTF("failed to retreive vcpu context\n"); 33.14 goto out; 33.15 } 33.16 - cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT; 33.17 + cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT; 33.18 33.19 /* Page Map Level 4 */ 33.20
34.1 --- a/tools/libxc/xc_private.h Thu Jun 19 12:48:04 2008 +0900 34.2 +++ b/tools/libxc/xc_private.h Wed Jul 02 11:30:37 2008 +0900 34.3 @@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle, 34.4 privcmd_mmap_entry_t *entries, int nr); 34.5 34.6 void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va, 34.7 - vcpu_guest_context_t *ctxt); 34.8 + vcpu_guest_context_any_t *ctxt); 34.9 int xc_waitdomain_core(int xc_handle, int domain, int *status, 34.10 - int options, vcpu_guest_context_t *ctxt); 34.11 + int options, vcpu_guest_context_any_t *ctxt); 34.12 34.13 void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits); 34.14 void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
35.1 --- a/tools/libxc/xc_ptrace.c Thu Jun 19 12:48:04 2008 +0900 35.2 +++ b/tools/libxc/xc_ptrace.c Wed Jul 02 11:30:37 2008 +0900 35.3 @@ -40,9 +40,9 @@ static int current_domid = -1; 35.4 static int current_isfile; 35.5 static int current_is_hvm; 35.6 35.7 -static uint64_t online_cpumap; 35.8 -static uint64_t regs_valid; 35.9 -static vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; 35.10 +static uint64_t online_cpumap; 35.11 +static uint64_t regs_valid; 35.12 +static vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS]; 35.13 35.14 extern int ffsll(long long int); 35.15 #define FOREACH_CPU(cpumap, i) for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) ) 35.16 @@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler 35.17 } 35.18 35.19 static inline int 35.20 -paging_enabled(vcpu_guest_context_t *v) 35.21 +paging_enabled(vcpu_guest_context_any_t *v) 35.22 { 35.23 - unsigned long cr0 = v->ctrlreg[0]; 35.24 + unsigned long cr0 = v->c.ctrlreg[0]; 35.25 return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); 35.26 } 35.27 35.28 @@ -174,7 +174,7 @@ map_domain_va_32( 35.29 35.30 l2 = xc_map_foreign_range( 35.31 xc_handle, current_domid, PAGE_SIZE, PROT_READ, 35.32 - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); 35.33 + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); 35.34 if ( l2 == NULL ) 35.35 return NULL; 35.36 35.37 @@ -216,7 +216,7 @@ map_domain_va_pae( 35.38 35.39 l3 = xc_map_foreign_range( 35.40 xc_handle, current_domid, PAGE_SIZE, PROT_READ, 35.41 - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); 35.42 + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); 35.43 if ( l3 == NULL ) 35.44 return NULL; 35.45 35.46 @@ -264,12 +264,12 @@ map_domain_va_64( 35.47 uint64_t *l4, *l3, *l2, *l1; 35.48 static void *v[MAX_VIRT_CPUS]; 35.49 35.50 - if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */ 35.51 + if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */ 35.52 return map_domain_va_32(xc_handle, cpu, guest_va, perm); 35.53 35.54 l4 = xc_map_foreign_range( 35.55 xc_handle, current_domid, PAGE_SIZE, PROT_READ, 35.56 - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); 35.57 + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); 35.58 if ( l4 == NULL ) 35.59 return NULL; 35.60 35.61 @@ -494,26 +494,26 @@ xc_ptrace( 35.62 case PTRACE_GETREGS: 35.63 if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 35.64 goto out_error; 35.65 - SET_PT_REGS(pt, ctxt[cpu].user_regs); 35.66 + SET_PT_REGS(pt, ctxt[cpu].c.user_regs); 35.67 memcpy(data, &pt, sizeof(struct gdb_regs)); 35.68 break; 35.69 35.70 case PTRACE_GETFPREGS: 35.71 if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 35.72 goto out_error; 35.73 - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t)); 35.74 + memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t)); 35.75 break; 35.76 35.77 case PTRACE_GETFPXREGS: 35.78 if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 35.79 goto out_error; 35.80 - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); 35.81 + memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt)); 35.82 break; 35.83 35.84 case PTRACE_SETREGS: 35.85 if (current_isfile) 35.86 goto out_unsupported; /* XXX not yet supported */ 35.87 - SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs); 35.88 + SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs); 35.89 if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, 35.90 &ctxt[cpu]))) 35.91 goto out_error_domctl; 35.92 @@ -525,7 +525,7 @@ xc_ptrace( 35.93 /* XXX we can still have problems if the user switches threads 35.94 * during single-stepping - 
but that just seems retarded 35.95 */ 35.96 - ctxt[cpu].user_regs.eflags |= PSL_T; 35.97 + ctxt[cpu].c.user_regs.eflags |= PSL_T; 35.98 if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, 35.99 &ctxt[cpu]))) 35.100 goto out_error_domctl; 35.101 @@ -542,9 +542,9 @@ xc_ptrace( 35.102 if (fetch_regs(xc_handle, cpu, NULL)) 35.103 goto out_error; 35.104 /* Clear trace flag */ 35.105 - if ( ctxt[cpu].user_regs.eflags & PSL_T ) 35.106 + if ( ctxt[cpu].c.user_regs.eflags & PSL_T ) 35.107 { 35.108 - ctxt[cpu].user_regs.eflags &= ~PSL_T; 35.109 + ctxt[cpu].c.user_regs.eflags &= ~PSL_T; 35.110 if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, 35.111 cpu, &ctxt[cpu]))) 35.112 goto out_error_domctl;
36.1 --- a/tools/libxc/xc_ptrace_core.c Thu Jun 19 12:48:04 2008 +0900 36.2 +++ b/tools/libxc/xc_ptrace_core.c Wed Jul 02 11:30:37 2008 +0900 36.3 @@ -641,24 +641,24 @@ static const struct xc_core_format_type* 36.4 36.5 void * 36.6 map_domain_va_core(unsigned long domfd, int cpu, void *guest_va, 36.7 - vcpu_guest_context_t *ctxt) 36.8 + vcpu_guest_context_any_t *ctxt) 36.9 { 36.10 if (current_format_type == NULL) 36.11 return NULL; 36.12 return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va, 36.13 - ctxt); 36.14 + &ctxt->c); 36.15 } 36.16 36.17 int 36.18 xc_waitdomain_core(int xc_handle, int domfd, int *status, int options, 36.19 - vcpu_guest_context_t *ctxt) 36.20 + vcpu_guest_context_any_t *ctxt) 36.21 { 36.22 int ret; 36.23 int i; 36.24 36.25 for (i = 0; i < NR_FORMAT_TYPE; i++) { 36.26 ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status, 36.27 - options, ctxt); 36.28 + options, &ctxt->c); 36.29 if (ret == 0) { 36.30 current_format_type = &format_type[i]; 36.31 break;
37.1 --- a/tools/libxc/xc_resume.c Thu Jun 19 12:48:04 2008 +0900 37.2 +++ b/tools/libxc/xc_resume.c Wed Jul 02 11:30:37 2008 +0900 37.3 @@ -13,7 +13,7 @@ 37.4 37.5 static int modify_returncode(int xc_handle, uint32_t domid) 37.6 { 37.7 - vcpu_guest_context_either_t ctxt; 37.8 + vcpu_guest_context_any_t ctxt; 37.9 xc_dominfo_t info; 37.10 xen_capabilities_info_t caps; 37.11 int rc; 37.12 @@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand 37.13 return -1; 37.14 } 37.15 37.16 - if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 ) 37.17 + if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 ) 37.18 return rc; 37.19 37.20 if ( !info.hvm ) 37.21 @@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand 37.22 else 37.23 ctxt.x32.user_regs.eax = 1; 37.24 37.25 - if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 ) 37.26 + if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 ) 37.27 return rc; 37.28 37.29 return 0; 37.30 @@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h 37.31 int i, rc = -1; 37.32 #if defined(__i386__) || defined(__x86_64__) 37.33 unsigned long mfn, p2m_size = 0; 37.34 - vcpu_guest_context_t ctxt; 37.35 + vcpu_guest_context_any_t ctxt; 37.36 start_info_t *start_info; 37.37 shared_info_t *shinfo = NULL; 37.38 xen_pfn_t *p2m_frame_list_list = NULL; 37.39 @@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h 37.40 goto out; 37.41 } 37.42 37.43 - mfn = ctxt.user_regs.edx; 37.44 + mfn = ctxt.c.user_regs.edx; 37.45 37.46 start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, 37.47 PROT_READ | PROT_WRITE, mfn);
38.1 --- a/tools/libxc/xenctrl.h Thu Jun 19 12:48:04 2008 +0900 38.2 +++ b/tools/libxc/xenctrl.h Wed Jul 02 11:30:37 2008 +0900 38.3 @@ -31,6 +31,11 @@ 38.4 #include <xen/xsm/acm_ops.h> 38.5 #include <xen/xsm/flask_op.h> 38.6 38.7 +#if defined(__i386__) || defined(__x86_64__) 38.8 +#include <xen/foreign/x86_32.h> 38.9 +#include <xen/foreign/x86_64.h> 38.10 +#endif 38.11 + 38.12 #ifdef __ia64__ 38.13 #define XC_PAGE_SHIFT 14 38.14 #else 38.15 @@ -162,6 +167,35 @@ typedef struct xc_dominfo { 38.16 } xc_dominfo_t; 38.17 38.18 typedef xen_domctl_getdomaininfo_t xc_domaininfo_t; 38.19 + 38.20 +typedef union 38.21 +{ 38.22 +#if defined(__i386__) || defined(__x86_64__) 38.23 + vcpu_guest_context_x86_64_t x64; 38.24 + vcpu_guest_context_x86_32_t x32; 38.25 +#endif 38.26 + vcpu_guest_context_t c; 38.27 +} vcpu_guest_context_any_t; 38.28 + 38.29 +typedef union 38.30 +{ 38.31 +#if defined(__i386__) || defined(__x86_64__) 38.32 + shared_info_x86_64_t x64; 38.33 + shared_info_x86_32_t x32; 38.34 +#endif 38.35 + shared_info_t s; 38.36 +} shared_info_any_t; 38.37 + 38.38 +typedef union 38.39 +{ 38.40 +#if defined(__i386__) || defined(__x86_64__) 38.41 + start_info_x86_64_t x64; 38.42 + start_info_x86_32_t x32; 38.43 +#endif 38.44 + start_info_t s; 38.45 +} start_info_any_t; 38.46 + 38.47 + 38.48 int xc_domain_create(int xc_handle, 38.49 uint32_t ssidref, 38.50 xen_domain_handle_t handle, 38.51 @@ -307,7 +341,7 @@ int xc_domain_getinfo(int xc_handle, 38.52 int xc_vcpu_setcontext(int xc_handle, 38.53 uint32_t domid, 38.54 uint32_t vcpu, 38.55 - vcpu_guest_context_t *ctxt); 38.56 + vcpu_guest_context_any_t *ctxt); 38.57 /** 38.58 * This function will return information about one or more domains, using a 38.59 * single hypercall. The domain information will be stored into the supplied 38.60 @@ -368,7 +402,7 @@ int xc_domain_hvm_setcontext(int xc_hand 38.61 int xc_vcpu_getcontext(int xc_handle, 38.62 uint32_t domid, 38.63 uint32_t vcpu, 38.64 - vcpu_guest_context_t *ctxt); 38.65 + vcpu_guest_context_any_t *ctxt); 38.66 38.67 typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t; 38.68 int xc_vcpu_getinfo(int xc_handle, 38.69 @@ -895,6 +929,12 @@ int xc_hvm_track_dirty_vram( 38.70 uint64_t first_pfn, uint64_t nr, 38.71 unsigned long *bitmap); 38.72 38.73 +/* 38.74 + * Notify that some pages got modified by the Device Model 38.75 + */ 38.76 +int xc_hvm_modified_memory( 38.77 + int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr); 38.78 + 38.79 typedef enum { 38.80 XC_ERROR_NONE = 0, 38.81 XC_INTERNAL_ERROR = 1,
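With the *_any_t unions now exported from xenctrl.h, callers pass the union to xc_vcpu_getcontext/xc_vcpu_setcontext and pick a view afterwards: .c for the native layout, or .x32/.x64 when a 64-bit toolstack handles a 32-bit guest (as xc_resume.c does above). A minimal usage sketch, assuming a valid xc interface handle and an existing domain:

```c
#include <stdio.h>
#include <xenctrl.h>

/* Minimal sketch: fetch VCPU0 of a domain through the new any-type
 * container and read a field through the native view.  Error handling
 * and architecture specifics are trimmed. */
static int dump_vcpu0_cr3(int xc_handle, uint32_t domid)
{
    vcpu_guest_context_any_t ctxt;

    if ( xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) )
        return -1;

#if defined(__i386__) || defined(__x86_64__)
    /* Native view; a 32-on-64 toolstack would use ctxt.x32 instead. */
    printf("vcpu0 cr3: %#llx\n", (unsigned long long)ctxt.c.ctrlreg[3]);
#endif
    return 0;
}
```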
39.1 --- a/tools/libxc/xg_save_restore.h Thu Jun 19 12:48:04 2008 +0900 39.2 +++ b/tools/libxc/xg_save_restore.h Wed Jul 02 11:30:37 2008 +0900 39.3 @@ -112,28 +112,6 @@ static inline int get_platform_info(int 39.4 #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL)) 39.5 39.6 39.7 -/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */ 39.8 -typedef union 39.9 -{ 39.10 - vcpu_guest_context_x86_64_t x64; 39.11 - vcpu_guest_context_x86_32_t x32; 39.12 - vcpu_guest_context_t c; 39.13 -} vcpu_guest_context_either_t; 39.14 - 39.15 -typedef union 39.16 -{ 39.17 - shared_info_x86_64_t x64; 39.18 - shared_info_x86_32_t x32; 39.19 - shared_info_t s; 39.20 -} shared_info_either_t; 39.21 - 39.22 -typedef union 39.23 -{ 39.24 - start_info_x86_64_t x64; 39.25 - start_info_x86_32_t x32; 39.26 - start_info_t s; 39.27 -} start_info_either_t; 39.28 - 39.29 #define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f)) 39.30 39.31 #define SET_FIELD(_p, _f, _v) do { \
40.1 --- a/tools/python/xen/util/blkif.py Thu Jun 19 12:48:04 2008 +0900 40.2 +++ b/tools/python/xen/util/blkif.py Wed Jul 02 11:30:37 2008 +0900 40.3 @@ -16,8 +16,11 @@ def blkdev_name_to_number(name): 40.4 40.5 n = expand_dev_name(name) 40.6 40.7 + devname = 'virtual-device' 40.8 + devnum = None 40.9 + 40.10 try: 40.11 - return os.stat(n).st_rdev 40.12 + return (devname, os.stat(n).st_rdev) 40.13 except Exception, ex: 40.14 pass 40.15 40.16 @@ -25,28 +28,30 @@ def blkdev_name_to_number(name): 40.17 if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n): 40.18 major = scsi_major[(ord(n[7:8]) - ord('a')) / 16] 40.19 minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0) 40.20 - return major * 256 + minor 40.21 - if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n): 40.22 + devnum = major * 256 + minor 40.23 + elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n): 40.24 major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ] 40.25 minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0) 40.26 - return major * 256 + minor 40.27 - 40.28 - if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n): 40.29 + devnum = major * 256 + minor 40.30 + elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n): 40.31 ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ] 40.32 major = ide_majors[(ord(n[7:8]) - ord('a')) / 2] 40.33 minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0) 40.34 - return major * 256 + minor 40.35 - 40.36 - if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n): 40.37 - return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0) 40.38 + devnum = major * 256 + minor 40.39 + elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n): 40.40 + devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0) 40.41 + elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n): 40.42 + devname = 'virtual-device-ext' 40.43 + devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0) 40.44 + elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n): 40.45 + devname = 'virtual-device-ext' 40.46 + devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0) 40.47 + elif re.match( '^(0x)[0-9a-fA-F]+$', name ): 40.48 + devnum = string.atoi(name, 16) 40.49 + elif re.match('^[0-9]+$', name): 40.50 + devnum = string.atoi(name, 10) 40.51 40.52 - if re.match( '^(0x)[0-9a-fA-F]+$', name ): 40.53 - return string.atoi(name,16) 40.54 - 40.55 - if re.match('^[0-9]+$', name): 40.56 - return string.atoi(name, 10) 40.57 - 40.58 - return None 40.59 + return (devname, devnum) 40.60 40.61 def blkdev_segment(name): 40.62 """Take the given block-device name (e.g. '/dev/sda1', 'hda') 40.63 @@ -58,7 +63,7 @@ def blkdev_segment(name): 40.64 type: 'Disk' or identifying name for partition type 40.65 """ 40.66 val = None 40.67 - n = blkdev_name_to_number(name) 40.68 + (name, n) = blkdev_name_to_number(name) 40.69 if not n is None: 40.70 val = { 'device' : n, 40.71 'start_sector' : long(0),
41.1 --- a/tools/python/xen/xend/XendConfig.py Thu Jun 19 12:48:04 2008 +0900 41.2 +++ b/tools/python/xen/xend/XendConfig.py Wed Jul 02 11:30:37 2008 +0900 41.3 @@ -1123,7 +1123,7 @@ class XendConfig(dict): 41.4 try: 41.5 devid = int(dev2) 41.6 except ValueError: 41.7 - devid = blkdev_name_to_number(dev2) 41.8 + (xenbus, devid) = blkdev_name_to_number(dev2) 41.9 if devid == None: 41.10 log.debug("The device %s is not device name", dev2) 41.11 return None
42.1 --- a/tools/python/xen/xend/XendOptions.py Thu Jun 19 12:48:04 2008 +0900 42.2 +++ b/tools/python/xen/xend/XendOptions.py Wed Jul 02 11:30:37 2008 +0900 42.3 @@ -132,6 +132,9 @@ class XendOptions: 42.4 """Default script to configure a backend network interface""" 42.5 vif_script = osdep.vif_script 42.6 42.7 + """Default rotation count of qemu-dm log file.""" 42.8 + qemu_dm_logrotate_count = 10 42.9 + 42.10 def __init__(self): 42.11 self.configure() 42.12 42.13 @@ -351,6 +354,10 @@ class XendOptions: 42.14 def get_vnc_x509_verify(self): 42.15 return self.get_config_string('vnc-x509-verify', self.xend_vnc_x509_verify) 42.16 42.17 + def get_qemu_dm_logrotate_count(self): 42.18 + return self.get_config_int("qemu-dm-logrotate-count", 42.19 + self.qemu_dm_logrotate_count) 42.20 + 42.21 42.22 class XendOptionsFile(XendOptions): 42.23
43.1 --- a/tools/python/xen/xend/image.py Thu Jun 19 12:48:04 2008 +0900 43.2 +++ b/tools/python/xen/xend/image.py Wed Jul 02 11:30:37 2008 +0900 43.3 @@ -378,13 +378,23 @@ class ImageHandler: 43.4 # keep track of pid and spawned options to kill it later 43.5 43.6 self.logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label']) 43.7 - if os.path.exists(self.logfile): 43.8 - if os.path.exists(self.logfile + ".1"): 43.9 - os.unlink(self.logfile + ".1") 43.10 - os.rename(self.logfile, self.logfile + ".1") 43.11 + 43.12 + # rotate log 43.13 + logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND 43.14 + logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count() 43.15 + if logrotate_count > 0: 43.16 + logfile_mode |= os.O_TRUNC 43.17 + if os.path.exists("%s.%d" % (self.logfile, logrotate_count)): 43.18 + os.unlink("%s.%d" % (self.logfile, logrotate_count)) 43.19 + for n in range(logrotate_count - 1, 0, -1): 43.20 + if os.path.exists("%s.%d" % (self.logfile, n)): 43.21 + os.rename("%s.%d" % (self.logfile, n), 43.22 + "%s.%d" % (self.logfile, (n + 1))) 43.23 + if os.path.exists(self.logfile): 43.24 + os.rename(self.logfile, self.logfile + ".1") 43.25 43.26 null = os.open("/dev/null", os.O_RDONLY) 43.27 - logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND) 43.28 + logfd = os.open(self.logfile, logfile_mode) 43.29 43.30 sys.stderr.flush() 43.31 pid = os.fork()
44.1 --- a/tools/python/xen/xend/server/blkif.py Thu Jun 19 12:48:04 2008 +0900 44.2 +++ b/tools/python/xen/xend/server/blkif.py Wed Jul 02 11:30:37 2008 +0900 44.3 @@ -81,11 +81,11 @@ class BlkifController(DevController): 44.4 if security.on() == xsconstants.XS_POLICY_ACM: 44.5 self.do_access_control(config, uname) 44.6 44.7 - devid = blkif.blkdev_name_to_number(dev) 44.8 + (device_path, devid) = blkif.blkdev_name_to_number(dev) 44.9 if devid is None: 44.10 raise VmError('Unable to find number for device (%s)' % (dev)) 44.11 44.12 - front = { 'virtual-device' : "%i" % devid, 44.13 + front = { device_path : "%i" % devid, 44.14 'device-type' : dev_type 44.15 } 44.16 44.17 @@ -204,5 +204,5 @@ class BlkifController(DevController): 44.18 dev = devid.split('/')[-1] 44.19 dev = int(dev) 44.20 except ValueError: 44.21 - dev = blkif.blkdev_name_to_number(dev) 44.22 + (device_path, dev) = blkif.blkdev_name_to_number(dev) 44.23 return dev
45.1 --- a/tools/python/xen/xm/main.py Thu Jun 19 12:48:04 2008 +0900 45.2 +++ b/tools/python/xen/xm/main.py Wed Jul 02 11:30:37 2008 +0900 45.3 @@ -2022,8 +2022,7 @@ def xm_block_list(args): 45.4 map(server.xenapi.VBD.get_runtime_properties, vbd_refs) 45.5 vbd_devs = \ 45.6 map(server.xenapi.VBD.get_device, vbd_refs) 45.7 - vbd_devids = \ 45.8 - map(blkdev_name_to_number, vbd_devs) 45.9 + vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs] 45.10 devs = map(lambda (devid, prop): [devid, map2sxp(prop)], 45.11 zip(vbd_devids, vbd_properties)) 45.12 else:
46.1 --- a/tools/tests/test_x86_emulator.c Thu Jun 19 12:48:04 2008 +0900 46.2 +++ b/tools/tests/test_x86_emulator.c Wed Jul 02 11:30:37 2008 +0900 46.3 @@ -22,23 +22,22 @@ 46.4 static int read( 46.5 unsigned int seg, 46.6 unsigned long offset, 46.7 - unsigned long *val, 46.8 + void *p_data, 46.9 unsigned int bytes, 46.10 struct x86_emulate_ctxt *ctxt) 46.11 { 46.12 - *val = 0; 46.13 - memcpy(val, (void *)offset, bytes); 46.14 + memcpy(p_data, (void *)offset, bytes); 46.15 return X86EMUL_OKAY; 46.16 } 46.17 46.18 static int write( 46.19 unsigned int seg, 46.20 unsigned long offset, 46.21 - unsigned long val, 46.22 + void *p_data, 46.23 unsigned int bytes, 46.24 struct x86_emulate_ctxt *ctxt) 46.25 { 46.26 - memcpy((void *)offset, &val, bytes); 46.27 + memcpy((void *)offset, p_data, bytes); 46.28 return X86EMUL_OKAY; 46.29 } 46.30
47.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 47.2 +++ b/tools/xenballoon/xenballoon-monitor Wed Jul 02 11:30:37 2008 +0900 47.3 @@ -0,0 +1,43 @@ 47.4 +#!/bin/bash 47.5 +# 47.6 +# xenballoon-monitor - monitor certain stats from xenballoond 47.7 +# (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output) 47.8 +# 47.9 +# Copyright (C) 2009 Oracle Corporation and/or its affiliates. 47.10 +# All rights reserved 47.11 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com> 47.12 +# 47.13 +# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines 47.14 +# 47.15 +echo "id mem-kb tgt-kb commit swapin swapout pgin pgout active(sec)" 47.16 +for i in `xenstore-list /local/domain`; do 47.17 + if [ "$i" -ne 0 ]; then 47.18 + tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0; 47.19 + if xenstore-exists /local/domain/$i/memory/meminfo; then 47.20 + tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \ 47.21 + | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` 47.22 + cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \ 47.23 + | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` 47.24 + fi 47.25 + if xenstore-exists /local/domain/$i/memory/selftarget; then 47.26 + tgt=`xenstore-read /local/domain/$i/memory/selftarget` 47.27 + fi 47.28 + if xenstore-exists /local/domain/$i/memory/vmstat; then 47.29 + sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \ 47.30 + | cut -d" " -f2` 47.31 + sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \ 47.32 + | cut -d" " -f2` 47.33 + pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \ 47.34 + | cut -d" " -f2` 47.35 + pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \ 47.36 + | cut -d" " -f2` 47.37 + fi 47.38 + if xenstore-exists /local/domain/$i/memory/uptime; then 47.39 + up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1` 47.40 + idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2` 47.41 + act=`echo $up - $idle | bc -iq` 47.42 + fi 47.43 + printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout $pgin $pgout $act 47.44 + fi 47.45 +done 47.46 +echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB
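The monitor's last column, active(sec), is simply uptime minus idle time taken from the /proc/uptime snapshot the guest pushes onto xenstore. A small Python sketch of that arithmetic, assuming only the two whitespace-separated fields of /proc/uptime (the function name is illustrative):

    def active_seconds(proc_uptime_text):
        # uptime minus idle, as the monitor's 'act' column computes it
        up, idle = (float(f) for f in proc_uptime_text.split()[:2])
        return up - idle

    print(active_seconds("1234.56 789.01"))   # ~445.55 seconds not spent idle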
48.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 48.2 +++ b/tools/xenballoon/xenballoon.conf Wed Jul 02 11:30:37 2008 +0900 48.3 @@ -0,0 +1,91 @@ 48.4 +## Path: System/xen 48.5 +## Description: xen domain start/stop on boot 48.6 +## Type: string 48.7 +## Default: 48.8 + 48.9 +# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists 48.10 +# but also that /usr/bin/xenstore-* tools are installed. 48.11 + 48.12 +## Type: boolean 48.13 +## Default: false 48.14 +# 48.15 +# If XENBALLOON_SELF is true, selfballooning will occur, meaning the 48.16 +# balloon driver will grow and shrink according to available memory. 48.17 +# If xenbus is enabled, may be overridden by {memory/selfballoon}==0 48.18 +# If false but xenballoond is able to communicate with domain0 via 48.19 +# xenbus, balloon targets will be set by domain0 48.20 +# 48.21 +XENBALLOON_SELF=false 48.22 + 48.23 +## Type: integer (must be > 0) 48.24 +## Default: 1 48.25 +# 48.26 +# If self-ballooning, number of seconds between checks/adjustments. 48.27 +# If xenbus is enabled, may be overridden by {memory/interval} 48.28 +XENBALLOON_SELF_INTERVAL=1 48.29 + 48.30 +## Type: integer (must be > 0) 48.31 +## Default: 1 48.32 +# 48.33 +# If NOT self-ballooning but xenbus is enabled, number of seconds between 48.34 +# checks/adjustments. May be overridden by {memory/interval} 48.35 +XENBALLOON_INTERVAL=1 48.36 + 48.37 +## Type: integer (must be > 0) 48.38 +## Default: 10 48.39 +# 48.40 +# When current > target, reduces rate at which target memory is ballooned 48.41 +# out. For a value of n, 1/n of the difference will be ballooned. 48.42 +# This value applies both to selfballooning and directed ballooning. 48.43 +# May be overridden by {memory/downhysteresis} 48.44 +XENBALLOON_AUTO_DOWNHYSTERESIS=10 48.45 + 48.46 +## Type: integer (must be > 0) 48.47 +## Default: 1 48.48 +# 48.49 +# When current < target, reduces rate at which target memory is reclaimed 48.50 +# (if available). For a value of n, 1/n of the difference will be ballooned. 48.51 +# This value applies both to selfballooning and directed ballooning. 48.52 +# May be overridden by {memory/uphysteresis} 48.53 +XENBALLOON_AUTO_UPHYSTERESIS=1 48.54 + 48.55 +## Type: integer (must be >= 0) 48.56 +## Default: 0 48.57 +# 48.58 +# In order to avoid ballooning so much memory that a guest experiences 48.59 +# out-of-memory errors (OOMs), memory will not be ballooned out below 48.60 +# a minimum target, in MB. If this value is 0 (default), an heuristic 48.61 +# based on the maximum amount of memory will be used. (The heuristic 48.62 +# provides the same minimum as recent versions of the balloon driver but 48.63 +# early versions of the balloon driver did not enforce a minimum.) 48.64 +XENBALLOON_MINMEM=0 48.65 + 48.66 +## Type: string 48.67 +## Default: "/var/run/xenballoon-maxmem" 48.68 +# 48.69 +# Location where memory high-water mark is stored; if a guest supports 48.70 +# hot-add memory, maxmem might increase across time and the minimum 48.71 +# target heuristic is based on max memory. NOTE: Reboot after changing 48.72 +# this variable, else overballooning may occur. 
48.73 +XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem 48.74 + 48.75 +## Type: integer (0 or 1) 48.76 +## Default: 1 48.77 +# 48.78 +# If xenbus is enabled, whether selfballooning or directed ballooning, 48.79 +# place the result of 'cat /proc/meminfo' on xenbus at memory/meminfo 48.80 +XENBALLOON_SEND_MEMINFO=1 48.81 + 48.82 +## Type: integer (0 or 1) 48.83 +## Default: 1 48.84 +# 48.85 +# If xenbus is enabled, whether selfballooning or directed ballooning, 48.86 +# place the result of 'cat /proc/vmstat' on xenbus at memory/vmstat 48.87 +XENBALLOON_SEND_VMSTAT=1 48.88 + 48.89 +## Type: integer (0 or 1) 48.90 +## Default: 1 48.91 +# 48.92 +# If xenbus is enabled, whether selfballooning or directed ballooning, 48.93 +# place the result of 'cat /proc/uptime' on xenbus at memory/uptime 48.94 +XENBALLOON_SEND_UPTIME=1
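The two hysteresis settings above damp how quickly the balloon moves: each pass closes only 1/n of the gap between current and target memory. A small Python sketch of that damping (the names and byte units mirror xenballoond's balloon_to_target, but this is only an illustration):

    def next_target(cur_bytes, tgt_bytes, downhysteresis=10, uphysteresis=1):
        # One ballooning step: close 1/n of the gap, as xenballoond does.
        if cur_bytes > tgt_bytes and downhysteresis:
            return cur_bytes - (cur_bytes - tgt_bytes) // downhysteresis
        if cur_bytes < tgt_bytes and uphysteresis:
            return cur_bytes + (tgt_bytes - cur_bytes) // uphysteresis
        return tgt_bytes

With the defaults, giving memory back to the guest is immediate (1/1 of the gap per interval) while ballooning memory out happens gradually (1/10 of the gap per interval).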
49.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 49.2 +++ b/tools/xenballoon/xenballoond Wed Jul 02 11:30:37 2008 +0900 49.3 @@ -0,0 +1,205 @@ 49.4 +#!/bin/bash 49.5 +# 49.6 +# Copyright (C) 2008 Oracle Corporation and/or its affiliates. 49.7 +# All rights reserved. 49.8 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com> 49.9 +# 49.10 +# xenballoond - In-guest engine for Xen memory ballooning 49.11 +# Version: 080630 49.12 +# 49.13 +# Two "policies" are implemented: 49.14 +# - Selfballooning: Adjust memory periodically, with no (or little) input 49.15 +# from domain0. Target memory is determined solely by the 49.16 +# Committed_AS line in /proc/meminfo, but parameters may adjust 49.17 +# the rate at which the target is achieved. 49.18 +# - Directed ballooning: Adjust memory solely as directed by domain0 49.19 +# 49.20 +# Under some circumstances, "output" may also be generated; the contents 49.21 +# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus. 49.22 +# 49.23 +# If xenbus is running and the /usr/bin/xenstore-* tools are installed, 49.24 +# "xenbus is enabled". 49.25 +# 49.26 +# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although 49.27 +# some are not used with directed ballooning, all must be set properly. 49.28 +# If xenbus is enabled, some of these parameters may be overridden by values 49.29 +# set by domain0 via xenbus. 49.30 + 49.31 +minmb() { 49.32 + RETVAL=$XENBALLOON_MINMEM 49.33 + if [ $RETVAL -ne 0 ]; then 49.34 + return $RETVAL 49.35 + fi 49.36 + kb=`cat $XENBALLOON_MAXMEMFILE` 49.37 + let "mb=$kb/1024" 49.38 + let "pages=$kb/4" 49.39 + # this algorithm from drivers/xen/balloon/balloon.c:minimum_target() 49.40 + # which was added to balloon.c in 2008 to avoid ballooning too small 49.41 + # it is unnecessary here except to accomodate pre-2008 balloon drivers 49.42 + # note that ranges are adjusted because a VM with "memory=1024" 49.43 + # gets somewhat less than 1024MB 49.44 + if [ $mb -lt 125 ]; then 49.45 + let RETVAL="$(( 8 + ($pages >> 9) ))" 49.46 + elif [ $mb -lt 500 ]; then 49.47 + let RETVAL="$(( 40 + ($pages >> 10) ))" 49.48 + elif [ $mb -lt 2000 ]; then 49.49 + let RETVAL="$(( 104 + ($pages >> 11) ))" 49.50 + else 49.51 + let RETVAL="$(( 296 + ($pages >> 13) ))" 49.52 + fi 49.53 + return # value returned in RETVAL in mB 49.54 +} 49.55 + 49.56 +curkb() { 49.57 + kb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | \ 49.58 + cut -f2 -d' '` 49.59 + RETVAL=$kb 49.60 + return # value returned in RETVAL in kB 49.61 +} 49.62 + 49.63 +downhysteresis() { 49.64 + RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS 49.65 + if [ $xenstore_enabled = "true" ]; then 49.66 + if xenstore-exists memory/downhysteresis ; then 49.67 + RETVAL=`xenstore-read memory/downhysteresis` 49.68 + fi 49.69 + fi 49.70 + return 49.71 +} 49.72 + 49.73 +uphysteresis() { 49.74 + RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS 49.75 + if [ $xenstore_enabled = "true" ]; then 49.76 + if xenstore-exists memory/uphysteresis ; then 49.77 + RETVAL=`xenstore-read memory/uphysteresis` 49.78 + fi 49.79 + fi 49.80 + return 49.81 +} 49.82 + 49.83 +selfballoon_eval() { 49.84 + if [ $xenstore_enabled = "true" ]; then 49.85 + if xenstore-exists memory/selfballoon; then 49.86 + RETVAL=`xenstore-read memory/selfballoon` 49.87 + if [ $RETVAL -eq 1 ]; then 49.88 + selfballoon_enabled=true 49.89 + return 49.90 + fi 49.91 + fi 49.92 + fi 49.93 + selfballoon_enabled=$XENBALLOON_SELF 49.94 + return 49.95 +} 49.96 + 49.97 +selftarget() { 49.98 + tgtkb=`grep Committed_AS /proc/meminfo | sed 's/ */ /' | 
cut -f2 -d' '` 49.99 + minmb 49.100 + let "minbytes=$RETVAL*1024*1024" 49.101 + let "tgtbytes=$tgtkb*1024" 49.102 + if [ $tgtbytes -lt $minbytes ]; then 49.103 + let "tgtbytes=$minbytes" 49.104 + fi 49.105 + RETVAL=$tgtbytes # value returned in RETVAL in bytes 49.106 + return 49.107 +} 49.108 + 49.109 +# $1 == 1 means use selftarget, else target in kB 49.110 +balloon_to_target() { 49.111 + if [ "$1" -eq 1 ]; then 49.112 + selftarget 49.113 + tgtbytes=$RETVAL 49.114 + else 49.115 + let "tgtbytes=$(( $1 * 1024 ))" 49.116 + fi 49.117 + curkb 49.118 + let "curbytes=$RETVAL*1024" 49.119 + if [ $curbytes -gt $tgtbytes ]; then 49.120 + downhysteresis 49.121 + downhys=$RETVAL 49.122 + if [ $downhys -ne 0 ]; then 49.123 + let "tgtbytes=$(( $curbytes - \ 49.124 + ( ( $curbytes - $tgtbytes ) / $downhys ) ))" 49.125 + fi 49.126 + else if [ $curbytes -lt $tgtbytes ]; then 49.127 + uphysteresis 49.128 + uphys=$RETVAL 49.129 + let "tgtbytes=$(( $curbytes + \ 49.130 + ( ( $tgtbytes - $curbytes ) / $uphys ) ))" 49.131 + fi 49.132 + fi 49.133 + echo $tgtbytes > /proc/xen/balloon 49.134 + if [ $xenstore_enabled = "true" ]; then 49.135 + let "tgtkb=$(( $tgtbytes/1024 ))" 49.136 + xenstore-write memory/selftarget $tgtkb 49.137 + fi 49.138 +} 49.139 + 49.140 +send_memory_stats() { 49.141 + if [ ! $xenstore_enabled = "true" ]; then 49.142 + return 49.143 + fi 49.144 + if [ $XENBALLOON_SEND_MEMINFO ]; then 49.145 + xenstore-write memory/meminfo "`cat /proc/meminfo`" 49.146 + fi 49.147 + if [ $XENBALLOON_SEND_VMSTAT ]; then 49.148 + xenstore-write memory/vmstat "`cat /proc/vmstat`" 49.149 + fi 49.150 + if [ $XENBALLOON_SEND_UPTIME ]; then 49.151 + xenstore-write memory/uptime "`cat /proc/uptime`" 49.152 + fi 49.153 +} 49.154 + 49.155 +if [ ! -f /proc/xen/balloon ]; then 49.156 + echo "$0: no balloon driver installed" 49.157 + exit 0 49.158 +fi 49.159 +if [ ! -f /proc/meminfo ]; then 49.160 + echo "$0: can't read /proc/meminfo" 49.161 + exit 0 49.162 +fi 49.163 +xenstore_enabled=true 49.164 +if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \ 49.165 + -f /usr/bin/xenstore-write ]; then 49.166 + xenstore_enabled=true 49.167 +else 49.168 + echo "$0: missing /usr/bin/xenstore-* tools, disabling directed ballooning" 49.169 + xenstore_enabled=false 49.170 +fi 49.171 + 49.172 +. /etc/sysconfig/xenballoon.conf 49.173 + 49.174 +while true; 49.175 +do 49.176 + # handle special case for PV domains with hot-add memory 49.177 + if [ ! -f $XENBALLOON_MAXMEMFILE ]; then 49.178 + maxkb=0 49.179 + else 49.180 + maxkb=`cat $XENBALLOON_MAXMEMFILE` 49.181 + fi 49.182 + curkb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '` 49.183 + if [ $curkb -gt $maxkb ]; then 49.184 + echo $curkb > $XENBALLOON_MAXMEMFILE 49.185 + fi 49.186 + interval=$XENBALLOON_INTERVAL 49.187 + # do self-ballooning 49.188 + selfballoon_eval 49.189 + if [ $selfballoon_enabled = "true" ]; then 49.190 + balloon_to_target 1 49.191 + interval=$XENBALLOON_SELF_INTERVAL 49.192 + # or do directed ballooning 49.193 + elif [ $xenstore_enabled = "true" ]; then 49.194 + if xenstore-exists memory/target ; then 49.195 + tgtkb=`xenstore-read memory/target` 49.196 + balloon_to_target $tgtkb 49.197 + fi 49.198 + interval=$XENBALLOON_INTERVAL 49.199 + fi 49.200 + send_memory_stats 49.201 + if [ $xenstore_enabled = "true" ]; then 49.202 + if xenstore-exists memory/interval ; then 49.203 + interval=`xenstore-read memory/interval` 49.204 + fi 49.205 + fi 49.206 + sleep $interval 49.207 +done & 49.208 +
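A key part of the selfballooning loop above is the floor computed by minmb(): memory is never ballooned below a minimum derived from the guest's maximum memory, using the same piecewise formula as the balloon driver's minimum_target(). A Python transcription of that heuristic, assuming maximum memory is known in kB (min_target_mb is an illustrative name):

    def min_target_mb(max_kb):
        # Floor for ballooning, mirroring xenballoond's minmb() heuristic.
        mb = max_kb // 1024
        pages = max_kb // 4          # 4 kB pages
        if mb < 125:
            return 8 + (pages >> 9)
        elif mb < 500:
            return 40 + (pages >> 10)
        elif mb < 2000:
            return 104 + (pages >> 11)
        else:
            return 296 + (pages >> 13)

For example, a guest whose recorded maximum is 1048576 kB gets a floor of 104 + (262144 >> 11) = 232 MB, below which selftarget() will not shrink it even if Committed_AS is smaller.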
50.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 50.2 +++ b/tools/xenballoon/xenballoond.README Wed Jul 02 11:30:37 2008 +0900 50.3 @@ -0,0 +1,82 @@ 50.4 +Xenballoond.README 50.5 +Preliminary version 0.1, 2008/06/30 50.6 + 50.7 +Copyright (C) 2008 Oracle Corporation and/or its affiliates. 50.8 +All rights reserved. 50.9 +Written by Dan Magenheimer <dan.magenheimer@oracle.com> 50.10 + 50.11 +INTRODUCTION 50.12 + 50.13 +Xenballoond runs in guest domains and both implements selfballooning and 50.14 +provides metrics to dom0 for (future) directed ballooning. Both capabilities 50.15 +provide a foundation for basic "memory overcommit" functionality. 50.16 + 50.17 +With selfballooning enabled, xenballoond uses the Committed_AS value found 50.18 +in /proc/meminfo as a first approximation of how much memory is required 50.19 +by the guest and feeds this statistic back to the balloon driver to inflate 50.20 +or deflate the balloon as required to achieve the target guest memory size. 50.21 +Hysteresis parameters may be adjusted to rate-limit balloon inflation 50.22 +and deflation. 50.23 + 50.24 +If configured, certain selfballooning parameters -- including notably 50.25 +enabling/disabling of self-ballooning -- can be controlled from domain0. 50.26 +(These are fully documented in xenballoon.conf.) 50.27 + 50.28 +If configured, the following guest statistics are sent back to domain0: 50.29 +- /proc/meminfo 50.30 +- /proc/vmstat 50.31 +- /proc/uptime 50.32 +In a future release, some of these values will be used by a policy module 50.33 +in domain0 to control guest balloon size and provide memory balancing 50.34 +across all guests on a given system. 50.35 + 50.36 +Note that no page sharing (content-based or otherwise) is implemented 50.37 +and no VMM-based swapping is necessary. 50.38 + 50.39 +For more information, see: 50.40 +http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf 50.41 +http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf 50.42 + 50.43 +INSTALLATION AND DEPLOYMENT 50.44 + 50.45 +In this preliminary release: 50.46 +- directed ballooning is not implemented, though a monitor is provided 50.47 +- only Redhat-based guests are supported 50.48 + 50.49 +Guest prerequisites to use xenballoond: 50.50 +- each guest must be configured with adequate[1] swap space 50.51 +- each guest must have the balloon driver installed (/proc/xen/balloon exists) 50.52 +- if directed ballooning (or monitoring) is desired, xenstore tools must be 50.53 + installed in each guest in /usr/bin [2] 50.54 + 50.55 +[1] for best results, for a guest that is configured with maxmem=N and 50.56 + requires Z MB of swap space without xenballoond, available swap should 50.57 + be increased to N+Z MB when xenballoond is running 50.58 +[2] specifically xenstore-read, xenstore-exists, and xenstore-write must 50.59 + be installed. 
Binaries can be obtained, for example, by building 50.60 + xen-vvv.gz/tools in a guest-binary-compatible development tree 50.61 + 50.62 +Instructions to install/deploy xenballoond (on a Redhat-based system): 50.63 +- in each guest: 50.64 + - ensure prerequisites are met (see above) 50.65 + - place xenballoon.conf in /etc/sysconfig 50.66 + - place xenballoond in /usr/sbin 50.67 + - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename) 50.68 + - edit /etc/sysconfig/xenballoon.conf as desired (especially note that 50.69 + selfballooning defaults to off) 50.70 + - start xenballoond with "service xenballoond start", and/or configure 50.71 + xenballoond to start at init (e.g. "chkconfig xenballoond on") 50.72 +- in domain0: 50.73 + - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin 50.74 +- note that certain xenballoon.conf variables may be overridden by domain0 50.75 + if xenstore is running in the guest; these are fully documented in 50.76 + xenballoon.conf 50.77 + 50.78 +TODO: 50.79 +080630 modifications to support SUSE-based and Debian-based guests 50.80 +080630 domain0 ballooning policy module 50.81 +080630 experiment with more aggressive (optionally) memory minimum targets 50.82 +080630 BUG: xenballoond doesn't properly record the fact that it's running; 50.83 + e.g. flipping between run levels 5 and 3 launches additional daemons 50.84 +080630 BUG: reports of possible incompatibilities between ballooning and 50.85 + save/restore/migrate have not been duplicated
51.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 51.2 +++ b/tools/xenballoon/xenballoond.init Wed Jul 02 11:30:37 2008 +0900 51.3 @@ -0,0 +1,91 @@ 51.4 +#!/bin/bash 51.5 +# 51.6 +# xenballoond Script to start and stop Xen ballooning daemon. 51.7 +# 51.8 +# Copyright (C) 2008 Oracle Corporation and/or its affiliates. 51.9 +# All rights reserved. 51.10 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com> 51.11 +# 51.12 +# chkconfig: 2345 98 01 51.13 +# description: Starts and stops the Xen control daemon. 51.14 +### BEGIN INIT INFO 51.15 +# Provides: xenballoond 51.16 +# Required-Start: $syslog $remote_fs 51.17 +# Should-Start: 51.18 +# Required-Stop: $syslog $remote_fs 51.19 +# Should-Stop: 51.20 +# Default-Start: 3 4 5 51.21 +# Default-Stop: 0 1 2 6 51.22 +# Default-Enabled: yes 51.23 +# Short-Description: Start/stop xend 51.24 +# Description: Starts and stops the Xen ballooning daemon. 51.25 +### END INIT INFO 51.26 + 51.27 +# Source function library 51.28 +. /etc/init.d/functions 51.29 + 51.30 +#don't use in domain0 51.31 +[ -f /proc/xen/capabilities ] && \ 51.32 + grep -q "control_d" /proc/xen/capabilities && exit 0 51.33 + 51.34 +if [ -f /etc/sysconfig/xenballoon.conf ]; then 51.35 + . /etc/sysconfig/xenballoon.conf 51.36 +fi 51.37 + 51.38 +# Check that balloon driver is present 51.39 +[ ! -f /proc/xen/balloon ] && exit 0 51.40 + 51.41 +# Record original memory (in kB) 51.42 +[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0 51.43 +let maxmem=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '` 51.44 +if [ -f "$XENBALLOON_MAXMEMFILE" ]; then 51.45 + let oldmax=`cat $XENBALLOON_MAXMEMFILE` 51.46 + if [ $oldmax -gt $maxmem ]; then 51.47 + let maxmem=oldmax 51.48 + fi 51.49 +fi 51.50 +echo $maxmem > $XENBALLOON_MAXMEMFILE 51.51 + 51.52 +RETVAL=0 51.53 +prog="xenballoond" 51.54 + 51.55 +start() { 51.56 + # Start daemons. 51.57 + echo -n $"Starting $prog: " 51.58 + daemon xenballoond $OPTIONS 51.59 + RETVAL=$? 51.60 + echo 51.61 + return $RETVAL 51.62 +} 51.63 + 51.64 +stop() { 51.65 + echo -n $"Shutting down $prog: " 51.66 + killproc xenballoond 51.67 + RETVAL=$? 51.68 + echo 51.69 + return $RETVAL 51.70 +} 51.71 + 51.72 +# See how we were called. 51.73 +case "$1" in 51.74 + start) 51.75 + start 51.76 + ;; 51.77 + stop) 51.78 + stop 51.79 + ;; 51.80 + status) 51.81 + status xenballoond 51.82 + RETVAL=$? 51.83 + ;; 51.84 + restart|reload) 51.85 + stop 51.86 + start 51.87 + RETVAL=$? 51.88 + ;; 51.89 + *) 51.90 + echo $"Usage: $0 {start|stop|restart|status}" 51.91 + exit 1 51.92 +esac 51.93 + 51.94 +exit $RETVAL
52.1 --- a/tools/xentrace/xenctx.c Thu Jun 19 12:48:04 2008 +0900 52.2 +++ b/tools/xentrace/xenctx.c Wed Jul 02 11:30:37 2008 +0900 52.3 @@ -702,7 +702,7 @@ void print_stack(vcpu_guest_context_t *c 52.4 void dump_ctx(int vcpu) 52.5 { 52.6 int ret; 52.7 - vcpu_guest_context_t ctx; 52.8 + vcpu_guest_context_any_t ctx; 52.9 xc_dominfo_t dominfo; 52.10 52.11 xc_handle = xc_interface_open(); /* for accessing control interface */ 52.12 @@ -727,10 +727,10 @@ void dump_ctx(int vcpu) 52.13 exit(-1); 52.14 } 52.15 52.16 - print_ctx(&ctx); 52.17 + print_ctx(&ctx.c); 52.18 #ifndef NO_TRANSLATION 52.19 - if (is_kernel_text(INSTR_POINTER((&ctx.user_regs)))) 52.20 - print_stack(&ctx, vcpu); 52.21 + if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs)))) 52.22 + print_stack(&ctx.c, vcpu); 52.23 #endif 52.24 52.25 if (!dominfo.paused) {
53.1 --- a/tools/xm-test/lib/XmTestLib/block_utils.py Thu Jun 19 12:48:04 2008 +0900 53.2 +++ b/tools/xm-test/lib/XmTestLib/block_utils.py Wed Jul 02 11:30:37 2008 +0900 53.3 @@ -15,7 +15,7 @@ import xen.util.blkif 53.4 53.5 53.6 def get_state(domain, devname): 53.7 - number = xen.util.blkif.blkdev_name_to_number(devname) 53.8 + (path, number) = xen.util.blkif.blkdev_name_to_number(devname) 53.9 s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" % 53.10 (domain.getName(), number)) 53.11 if s != 0:
54.1 --- a/xen/arch/ia64/vmx/vmx_hypercall.c Thu Jun 19 12:48:04 2008 +0900 54.2 +++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Jul 02 11:30:37 2008 +0900 54.3 @@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA 54.4 rc = -ENOSYS; 54.5 break; 54.6 54.7 + case HVMOP_modified_memory: 54.8 + { 54.9 + struct xen_hvm_modified_memory a; 54.10 + struct domain *d; 54.11 + unsigned long pfn; 54.12 + 54.13 + if ( copy_from_guest(&a, arg, 1) ) 54.14 + return -EFAULT; 54.15 + 54.16 + if ( a.domid == DOMID_SELF ) 54.17 + { 54.18 + d = rcu_lock_current_domain(); 54.19 + } 54.20 + else 54.21 + { 54.22 + if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL ) 54.23 + return -ESRCH; 54.24 + if ( !IS_PRIV_FOR(current->domain, d) ) 54.25 + { 54.26 + rc = -EPERM; 54.27 + goto param_fail3; 54.28 + } 54.29 + } 54.30 + 54.31 + rc = -EINVAL; 54.32 + if ( !is_hvm_domain(d) ) 54.33 + goto param_fail3; 54.34 + 54.35 + rc = -EINVAL; 54.36 + if ( a.first_pfn > domain_get_maximum_gpfn(d) 54.37 + || a.first_pfn + a.nr - 1 < a.first_pfn 54.38 + || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d)) 54.39 + goto param_fail3; 54.40 + 54.41 + rc = 0; 54.42 + if ( !d->arch.shadow_bitmap ) 54.43 + goto param_fail3; 54.44 + 54.45 + for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++) 54.46 + if (pfn < d->arch.shadow_bitmap_size) 54.47 + set_bit(pfn, d->arch.shadow_bitmap); 54.48 + 54.49 + param_fail3: 54.50 + rcu_unlock_domain(d); 54.51 + break; 54.52 + } 54.53 + 54.54 default: 54.55 gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op); 54.56 rc = -ENOSYS;
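The pfn range validation in this handler uses the usual wrap-around guard: with unsigned arithmetic, first_pfn + nr - 1 ending up below first_pfn means the range overflowed and must be rejected. A small Python sketch of the same three checks (the helper and the explicit 64-bit mask are illustrative, since Python integers do not wrap on their own):

    def valid_pfn_range(first_pfn, nr, max_gpfn, bits=64):
        # Range check in the style of the HVMOP_modified_memory handler.
        mask = (1 << bits) - 1
        last = (first_pfn + nr - 1) & mask   # wraps like an unsigned long
        if first_pfn > max_gpfn:
            return False
        if last < first_pfn:                 # wrapped around: reject
            return False
        if last > max_gpfn:
            return False
        return True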
55.1 --- a/xen/arch/ia64/xen/mm.c Thu Jun 19 12:48:04 2008 +0900 55.2 +++ b/xen/arch/ia64/xen/mm.c Wed Jul 02 11:30:37 2008 +0900 55.3 @@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void) 55.4 * Any Xen-heap pages that we will allow to be mapped will have 55.5 * their domain field set to dom_xen. 55.6 */ 55.7 - dom_xen = alloc_domain(DOMID_XEN); 55.8 + dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); 55.9 BUG_ON(dom_xen == NULL); 55.10 55.11 /* 55.12 @@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void) 55.13 * This domain owns I/O pages that are within the range of the page_info 55.14 * array. Mappings occur at the priv of the caller. 55.15 */ 55.16 - dom_io = alloc_domain(DOMID_IO); 55.17 + dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); 55.18 BUG_ON(dom_io == NULL); 55.19 } 55.20 55.21 @@ -1553,7 +1553,7 @@ expose_p2m_init(void) 55.22 * Initialise our DOMID_P2M domain. 55.23 * This domain owns m2p table pages. 55.24 */ 55.25 - dom_p2m = alloc_domain(DOMID_P2M); 55.26 + dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0); 55.27 BUG_ON(dom_p2m == NULL); 55.28 dom_p2m->max_pages = ~0U; 55.29
56.1 --- a/xen/arch/x86/acpi/cpufreq/Makefile Thu Jun 19 12:48:04 2008 +0900 56.2 +++ b/xen/arch/x86/acpi/cpufreq/Makefile Wed Jul 02 11:30:37 2008 +0900 56.3 @@ -1,3 +1,4 @@ 56.4 obj-y += cpufreq.o 56.5 obj-y += utility.o 56.6 obj-y += cpufreq_ondemand.o 56.7 +obj-y += powernow.o
57.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Thu Jun 19 12:48:04 2008 +0900 57.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Wed Jul 02 11:30:37 2008 +0900 57.3 @@ -47,6 +47,10 @@ 57.4 struct processor_pminfo processor_pminfo[NR_CPUS]; 57.5 struct cpufreq_policy xen_px_policy[NR_CPUS]; 57.6 57.7 +static cpumask_t *cpufreq_dom_pt; 57.8 +static cpumask_t cpufreq_dom_mask; 57.9 +static unsigned int cpufreq_dom_max; 57.10 + 57.11 enum { 57.12 UNDEFINED_CAPABLE = 0, 57.13 SYSTEM_INTEL_MSR_CAPABLE, 57.14 @@ -60,7 +64,6 @@ struct acpi_cpufreq_data { 57.15 struct processor_performance *acpi_data; 57.16 struct cpufreq_frequency_table *freq_table; 57.17 unsigned int max_freq; 57.18 - unsigned int resume; 57.19 unsigned int cpu_feature; 57.20 }; 57.21 57.22 @@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp 57.23 57.24 next_perf_state = data->freq_table[next_state].index; 57.25 if (perf->state == next_perf_state) { 57.26 - if (unlikely(data->resume)) { 57.27 - printk("xen_pminfo: @acpi_cpufreq_target, " 57.28 - "Called after resume, resetting to P%d\n", 57.29 + if (unlikely(policy->resume)) { 57.30 + printk(KERN_INFO "Called after resume, resetting to P%d\n", 57.31 next_perf_state); 57.32 - data->resume = 0; 57.33 + policy->resume = 0; 57.34 } 57.35 - else 57.36 + else { 57.37 + printk(KERN_INFO "Already at target state (P%d)\n", 57.38 + next_perf_state); 57.39 return 0; 57.40 + } 57.41 } 57.42 57.43 switch (data->cpu_feature) { 57.44 @@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol 57.45 * the first call to ->target() should result in us actually 57.46 * writing something to the appropriate registers. 57.47 */ 57.48 - data->resume = 1; 57.49 + policy->resume = 1; 57.50 57.51 return result; 57.52 57.53 @@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre 57.54 .init = acpi_cpufreq_cpu_init, 57.55 }; 57.56 57.57 -int acpi_cpufreq_init(void) 57.58 +void cpufreq_dom_exit(void) 57.59 { 57.60 - unsigned int i, ret = 0; 57.61 - unsigned int dom, max_dom = 0; 57.62 - cpumask_t *pt, dom_mask; 57.63 + cpufreq_dom_max = 0; 57.64 + cpus_clear(cpufreq_dom_mask); 57.65 + if (cpufreq_dom_pt) 57.66 + xfree(cpufreq_dom_pt); 57.67 +} 57.68 57.69 - cpus_clear(dom_mask); 57.70 +int cpufreq_dom_init(void) 57.71 +{ 57.72 + unsigned int i; 57.73 + 57.74 + cpufreq_dom_max = 0; 57.75 + cpus_clear(cpufreq_dom_mask); 57.76 57.77 for_each_online_cpu(i) { 57.78 - cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask); 57.79 - if (max_dom < processor_pminfo[i].perf.domain_info.domain) 57.80 - max_dom = processor_pminfo[i].perf.domain_info.domain; 57.81 + cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask); 57.82 + if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain) 57.83 + cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain; 57.84 } 57.85 - max_dom++; 57.86 + cpufreq_dom_max++; 57.87 57.88 - pt = xmalloc_array(cpumask_t, max_dom); 57.89 - if (!pt) 57.90 + cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max); 57.91 + if (!cpufreq_dom_pt) 57.92 return -ENOMEM; 57.93 - memset(pt, 0, max_dom * sizeof(cpumask_t)); 57.94 - 57.95 - /* get cpumask of each psd domain */ 57.96 - for_each_online_cpu(i) 57.97 - cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]); 57.98 + memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t)); 57.99 57.100 for_each_online_cpu(i) 57.101 - processor_pminfo[i].perf.shared_cpu_map = 57.102 - pt[processor_pminfo[i].perf.domain_info.domain]; 57.103 + cpu_set(i, 
cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]); 57.104 + 57.105 + for_each_online_cpu(i) 57.106 + processor_pminfo[i].perf.shared_cpu_map = 57.107 + cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]; 57.108 57.109 - cpufreq_driver = &acpi_cpufreq_driver; 57.110 + return 0; 57.111 +} 57.112 57.113 - /* setup cpufreq infrastructure */ 57.114 +static int cpufreq_cpu_init(void) 57.115 +{ 57.116 + int i, ret = 0; 57.117 + 57.118 for_each_online_cpu(i) { 57.119 xen_px_policy[i].cpu = i; 57.120 57.121 ret = px_statistic_init(i); 57.122 if (ret) 57.123 - goto out; 57.124 + return ret; 57.125 57.126 ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]); 57.127 if (ret) 57.128 - goto out; 57.129 + return ret; 57.130 } 57.131 + return ret; 57.132 +} 57.133 57.134 - /* setup ondemand cpufreq */ 57.135 - for (dom=0; dom<max_dom; dom++) { 57.136 - if (!cpu_isset(dom, dom_mask)) 57.137 +int cpufreq_dom_dbs(unsigned int event) 57.138 +{ 57.139 + int cpu, dom, ret = 0; 57.140 + 57.141 + for (dom=0; dom<cpufreq_dom_max; dom++) { 57.142 + if (!cpu_isset(dom, cpufreq_dom_mask)) 57.143 continue; 57.144 - i = first_cpu(pt[dom]); 57.145 - ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START); 57.146 + cpu = first_cpu(cpufreq_dom_pt[dom]); 57.147 + ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event); 57.148 if (ret) 57.149 - goto out; 57.150 + return ret; 57.151 } 57.152 - 57.153 -out: 57.154 - xfree(pt); 57.155 - 57.156 return ret; 57.157 } 57.158 + 57.159 +int acpi_cpufreq_init(void) 57.160 +{ 57.161 + int ret = 0; 57.162 + 57.163 + /* setup cpumask of psd dom and shared cpu map of cpu */ 57.164 + ret = cpufreq_dom_init(); 57.165 + if (ret) 57.166 + goto err; 57.167 + 57.168 + /* setup cpufreq driver */ 57.169 + cpufreq_driver = &acpi_cpufreq_driver; 57.170 + 57.171 + /* setup cpufreq infrastructure */ 57.172 + ret = cpufreq_cpu_init(); 57.173 + if (ret) 57.174 + goto err; 57.175 + 57.176 + /* setup cpufreq dbs according to dom coordiation */ 57.177 + ret = cpufreq_dom_dbs(CPUFREQ_GOV_START); 57.178 + if (ret) 57.179 + goto err; 57.180 + 57.181 + return ret; 57.182 + 57.183 +err: 57.184 + cpufreq_dom_exit(); 57.185 + return ret; 57.186 +}
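The refactoring above splits acpi_cpufreq_init() into three steps that can also be rerun across suspend/resume: build the per-PSD-domain cpumasks (cpufreq_dom_init), initialise each CPU's policy (cpufreq_cpu_init), and start or stop the ondemand governor once per domain on that domain's first CPU (cpufreq_dom_dbs). A Python sketch of the grouping step, with a made-up CPU-to-domain mapping:

    def group_by_psd_domain(cpu_domain):
        # cpu_domain maps cpu -> PSD domain id; returns domain -> set of cpus.
        doms = {}
        for cpu, dom in sorted(cpu_domain.items()):
            doms.setdefault(dom, set()).add(cpu)
        return doms

    cpu_domain = {0: 0, 1: 0, 2: 1, 3: 1}        # illustrative topology
    doms = group_by_psd_domain(cpu_domain)
    # Every CPU in a domain shares the same shared_cpu_map, and the governor
    # is driven once per domain, on that domain's lowest-numbered CPU.
    governor_cpus = [min(cpus) for cpus in doms.values()]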
58.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Thu Jun 19 12:48:04 2008 +0900 58.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Wed Jul 02 11:30:37 2008 +0900 58.3 @@ -52,7 +52,7 @@ static struct dbs_tuners { 58.4 58.5 static struct timer dbs_timer[NR_CPUS]; 58.6 58.7 -static inline uint64_t get_cpu_idle_time(unsigned int cpu) 58.8 +inline uint64_t get_cpu_idle_time(unsigned int cpu) 58.9 { 58.10 uint64_t idle_ns; 58.11 struct vcpu *v; 58.12 @@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs 58.13 return; 58.14 58.15 policy = this_dbs_info->cur_policy; 58.16 + 58.17 + if (unlikely(policy->resume)) { 58.18 + __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H); 58.19 + return; 58.20 + } 58.21 + 58.22 cur_ns = NOW(); 58.23 total_ns = cur_ns - this_dbs_info->prev_cpu_wall; 58.24 this_dbs_info->prev_cpu_wall = NOW(); 58.25 @@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_ 58.26 break; 58.27 58.28 case CPUFREQ_GOV_STOP: 58.29 - if (this_dbs_info->enable) 58.30 - dbs_timer_exit(this_dbs_info); 58.31 + dbs_timer_exit(this_dbs_info); 58.32 dbs_enable--; 58.33 58.34 break; 58.35 @@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_ 58.36 break; 58.37 } 58.38 return 0; 58.39 -} 58.40 - 58.41 +}
59.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 59.2 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c Wed Jul 02 11:30:37 2008 +0900 59.3 @@ -0,0 +1,305 @@ 59.4 +/* 59.5 + * powernow - AMD Architectural P-state Driver ($Revision: 1.4 $) 59.6 + * 59.7 + * Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@amd.com> 59.8 + * 59.9 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 59.10 + * 59.11 + * This program is free software; you can redistribute it and/or modify 59.12 + * it under the terms of the GNU General Public License as published by 59.13 + * the Free Software Foundation; either version 2 of the License, or (at 59.14 + * your option) any later version. 59.15 + * 59.16 + * This program is distributed in the hope that it will be useful, but 59.17 + * WITHOUT ANY WARRANTY; without even the implied warranty of 59.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 59.19 + * General Public License for more details. 59.20 + * 59.21 + * You should have received a copy of the GNU General Public License along 59.22 + * with this program; if not, write to the Free Software Foundation, Inc., 59.23 + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 59.24 + * 59.25 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 59.26 + */ 59.27 + 59.28 +#include <xen/types.h> 59.29 +#include <xen/errno.h> 59.30 +#include <xen/delay.h> 59.31 +#include <xen/cpumask.h> 59.32 +#include <xen/timer.h> 59.33 +#include <xen/xmalloc.h> 59.34 +#include <asm/bug.h> 59.35 +#include <asm/msr.h> 59.36 +#include <asm/io.h> 59.37 +#include <asm/config.h> 59.38 +#include <asm/processor.h> 59.39 +#include <asm/percpu.h> 59.40 +#include <asm/cpufeature.h> 59.41 +#include <acpi/acpi.h> 59.42 +#include <acpi/cpufreq/cpufreq.h> 59.43 + 59.44 +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 59.45 +#define USE_HW_PSTATE 0x00000080 59.46 +#define HW_PSTATE_MASK 0x00000007 59.47 +#define HW_PSTATE_VALID_MASK 0x80000000 59.48 +#define HW_PSTATE_MAX_MASK 0x000000f0 59.49 +#define HW_PSTATE_MAX_SHIFT 4 59.50 +#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ 59.51 +#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ 59.52 +#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ 59.53 +#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ 59.54 + 59.55 +extern struct processor_pminfo processor_pminfo[NR_CPUS]; 59.56 +extern struct cpufreq_policy xen_px_policy[NR_CPUS]; 59.57 + 59.58 +struct powernow_cpufreq_data { 59.59 + struct processor_performance *acpi_data; 59.60 + struct cpufreq_frequency_table *freq_table; 59.61 + unsigned int max_freq; 59.62 + unsigned int resume; 59.63 + unsigned int cpu_feature; 59.64 +}; 59.65 + 59.66 +static struct powernow_cpufreq_data *drv_data[NR_CPUS]; 59.67 + 59.68 +struct drv_cmd { 59.69 + unsigned int type; 59.70 + cpumask_t mask; 59.71 + u64 addr; 59.72 + u32 val; 59.73 +}; 59.74 + 59.75 +static void transition_pstate(void *drvcmd) 59.76 +{ 59.77 + struct drv_cmd *cmd; 59.78 + cmd = (struct drv_cmd *) drvcmd; 59.79 + 59.80 + wrmsr(MSR_PSTATE_CTRL, cmd->val, 0); 59.81 +} 59.82 + 59.83 +static int powernow_cpufreq_target(struct cpufreq_policy *policy, 59.84 + unsigned int target_freq, unsigned int relation) 59.85 +{ 59.86 + struct powernow_cpufreq_data *data = drv_data[policy->cpu]; 59.87 + struct processor_performance *perf; 59.88 + struct cpufreq_freqs freqs; 59.89 + cpumask_t online_policy_cpus; 59.90 + struct drv_cmd cmd; 59.91 + unsigned int next_state = 0; /* Index into 
freq_table */ 59.92 + unsigned int next_perf_state = 0; /* Index into perf table */ 59.93 + int result = 0; 59.94 + 59.95 + if (unlikely(data == NULL || 59.96 + data->acpi_data == NULL || data->freq_table == NULL)) { 59.97 + return -ENODEV; 59.98 + } 59.99 + 59.100 + perf = data->acpi_data; 59.101 + result = cpufreq_frequency_table_target(policy, 59.102 + data->freq_table, 59.103 + target_freq, 59.104 + relation, &next_state); 59.105 + if (unlikely(result)) 59.106 + return -ENODEV; 59.107 + 59.108 + online_policy_cpus = policy->cpus; 59.109 + 59.110 + next_perf_state = data->freq_table[next_state].index; 59.111 + if (perf->state == next_perf_state) { 59.112 + if (unlikely(data->resume)) 59.113 + data->resume = 0; 59.114 + else 59.115 + return 0; 59.116 + } 59.117 + 59.118 + cpus_clear(cmd.mask); 59.119 + 59.120 + if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) 59.121 + cmd.mask = online_policy_cpus; 59.122 + else 59.123 + cpu_set(policy->cpu, cmd.mask); 59.124 + 59.125 + freqs.old = perf->states[perf->state].core_frequency * 1000; 59.126 + freqs.new = data->freq_table[next_state].frequency; 59.127 + 59.128 + cmd.val = next_perf_state; 59.129 + 59.130 + on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0); 59.131 + 59.132 + perf->state = next_perf_state; 59.133 + policy->cur = freqs.new; 59.134 + 59.135 + return result; 59.136 +} 59.137 + 59.138 +static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy) 59.139 +{ 59.140 + unsigned int i; 59.141 + unsigned int valid_states = 0; 59.142 + unsigned int cpu = policy->cpu; 59.143 + struct powernow_cpufreq_data *data; 59.144 + unsigned int result = 0; 59.145 + struct processor_performance *perf; 59.146 + u32 max_hw_pstate, hi = 0, lo = 0; 59.147 + 59.148 + data = xmalloc(struct powernow_cpufreq_data); 59.149 + if (!data) 59.150 + return -ENOMEM; 59.151 + memset(data, 0, sizeof(struct powernow_cpufreq_data)); 59.152 + 59.153 + drv_data[cpu] = data; 59.154 + 59.155 + data->acpi_data = &processor_pminfo[cpu].perf; 59.156 + 59.157 + perf = data->acpi_data; 59.158 + policy->shared_type = perf->shared_type; 59.159 + 59.160 + /* 59.161 + * Will let policy->cpus know about dependency only when software 59.162 + * coordination is required. 
59.163 + */ 59.164 + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || 59.165 + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 59.166 + policy->cpus = perf->shared_cpu_map; 59.167 + } else { 59.168 + policy->cpus = cpumask_of_cpu(cpu); 59.169 + } 59.170 + 59.171 + /* capability check */ 59.172 + if (perf->state_count <= 1) { 59.173 + printk("No P-States\n"); 59.174 + result = -ENODEV; 59.175 + goto err_unreg; 59.176 + } 59.177 + rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 59.178 + max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 59.179 + 59.180 + if (perf->control_register.space_id != perf->status_register.space_id) { 59.181 + result = -ENODEV; 59.182 + goto err_unreg; 59.183 + } 59.184 + 59.185 + data->freq_table = xmalloc_array(struct cpufreq_frequency_table, 59.186 + (perf->state_count+1)); 59.187 + if (!data->freq_table) { 59.188 + result = -ENOMEM; 59.189 + goto err_unreg; 59.190 + } 59.191 + 59.192 + /* detect transition latency */ 59.193 + policy->cpuinfo.transition_latency = 0; 59.194 + for (i=0; i<perf->state_count; i++) { 59.195 + if ((perf->states[i].transition_latency * 1000) > 59.196 + policy->cpuinfo.transition_latency) 59.197 + policy->cpuinfo.transition_latency = 59.198 + perf->states[i].transition_latency * 1000; 59.199 + } 59.200 + 59.201 + data->max_freq = perf->states[0].core_frequency * 1000; 59.202 + /* table init */ 59.203 + for (i=0; i<perf->state_count && i<max_hw_pstate; i++) { 59.204 + if (i>0 && perf->states[i].core_frequency >= 59.205 + data->freq_table[valid_states-1].frequency / 1000) 59.206 + continue; 59.207 + 59.208 + data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK; 59.209 + data->freq_table[valid_states].frequency = 59.210 + perf->states[i].core_frequency * 1000; 59.211 + valid_states++; 59.212 + } 59.213 + data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; 59.214 + perf->state = 0; 59.215 + 59.216 + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); 59.217 + if (result) 59.218 + goto err_freqfree; 59.219 + 59.220 + /* 59.221 + * the first call to ->target() should result in us actually 59.222 + * writing something to the appropriate registers. 
59.223 + */ 59.224 + data->resume = 1; 59.225 + 59.226 + policy->cur = data->freq_table[i].frequency; 59.227 + return result; 59.228 + 59.229 +err_freqfree: 59.230 + xfree(data->freq_table); 59.231 +err_unreg: 59.232 + xfree(data); 59.233 + drv_data[cpu] = NULL; 59.234 + 59.235 + return result; 59.236 +} 59.237 + 59.238 +static struct cpufreq_driver powernow_cpufreq_driver = { 59.239 + .target = powernow_cpufreq_target, 59.240 + .init = powernow_cpufreq_cpu_init, 59.241 +}; 59.242 + 59.243 +int powernow_cpufreq_init(void) 59.244 +{ 59.245 + unsigned int i, ret = 0; 59.246 + unsigned int dom, max_dom = 0; 59.247 + cpumask_t *pt, dom_mask; 59.248 + 59.249 + cpus_clear(dom_mask); 59.250 + 59.251 + for_each_online_cpu(i) { 59.252 + struct cpuinfo_x86 *c = &cpu_data[i]; 59.253 + if (c->x86_vendor != X86_VENDOR_AMD) 59.254 + ret = -ENODEV; 59.255 + else 59.256 + { 59.257 + u32 eax, ebx, ecx, edx; 59.258 + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 59.259 + if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) 59.260 + ret = -ENODEV; 59.261 + } 59.262 + if (ret) 59.263 + return ret; 59.264 + cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask); 59.265 + if (max_dom < processor_pminfo[i].perf.domain_info.domain) 59.266 + max_dom = processor_pminfo[i].perf.domain_info.domain; 59.267 + } 59.268 + max_dom++; 59.269 + 59.270 + pt = xmalloc_array(cpumask_t, max_dom); 59.271 + if (!pt) 59.272 + return -ENOMEM; 59.273 + memset(pt, 0, max_dom * sizeof(cpumask_t)); 59.274 + 59.275 + /* get cpumask of each psd domain */ 59.276 + for_each_online_cpu(i) 59.277 + cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]); 59.278 + 59.279 + for_each_online_cpu(i) 59.280 + processor_pminfo[i].perf.shared_cpu_map = 59.281 + pt[processor_pminfo[i].perf.domain_info.domain]; 59.282 + 59.283 + cpufreq_driver = &powernow_cpufreq_driver; 59.284 + 59.285 + /* setup cpufreq infrastructure */ 59.286 + for_each_online_cpu(i) { 59.287 + xen_px_policy[i].cpu = i; 59.288 + 59.289 + ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]); 59.290 + if (ret) 59.291 + goto cpufreq_init_out; 59.292 + } 59.293 + 59.294 + /* setup ondemand cpufreq */ 59.295 + for (dom=0; dom<max_dom; dom++) { 59.296 + if (!cpu_isset(dom, dom_mask)) 59.297 + continue; 59.298 + i = first_cpu(pt[dom]); 59.299 + ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START); 59.300 + if (ret) 59.301 + goto cpufreq_init_out; 59.302 + } 59.303 + 59.304 +cpufreq_init_out: 59.305 + xfree(pt); 59.306 + 59.307 + return ret; 59.308 +}
60.1 --- a/xen/arch/x86/acpi/cpufreq/utility.c Thu Jun 19 12:48:04 2008 +0900 60.2 +++ b/xen/arch/x86/acpi/cpufreq/utility.c Wed Jul 02 11:30:37 2008 +0900 60.3 @@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver; 60.4 * Px STATISTIC INFO * 60.5 *********************************************************************/ 60.6 60.7 +void px_statistic_suspend(void) 60.8 +{ 60.9 + int cpu; 60.10 + uint64_t now; 60.11 + 60.12 + now = NOW(); 60.13 + 60.14 + for_each_online_cpu(cpu) { 60.15 + struct pm_px *pxpt = &px_statistic_data[cpu]; 60.16 + uint64_t total_idle_ns; 60.17 + uint64_t tmp_idle_ns; 60.18 + 60.19 + total_idle_ns = get_cpu_idle_time(cpu); 60.20 + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; 60.21 + 60.22 + pxpt->u.pt[pxpt->u.cur].residency += 60.23 + now - pxpt->prev_state_wall; 60.24 + pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns; 60.25 + } 60.26 +} 60.27 + 60.28 +void px_statistic_resume(void) 60.29 +{ 60.30 + int cpu; 60.31 + uint64_t now; 60.32 + 60.33 + now = NOW(); 60.34 + 60.35 + for_each_online_cpu(cpu) { 60.36 + struct pm_px *pxpt = &px_statistic_data[cpu]; 60.37 + pxpt->prev_state_wall = now; 60.38 + pxpt->prev_idle_wall = get_cpu_idle_time(cpu); 60.39 + } 60.40 +} 60.41 + 60.42 void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to) 60.43 { 60.44 uint32_t i; 60.45 @@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma 60.46 for_each_cpu_mask(i, cpumask) { 60.47 struct pm_px *pxpt = &px_statistic_data[i]; 60.48 uint32_t statnum = processor_pminfo[i].perf.state_count; 60.49 + uint64_t total_idle_ns; 60.50 + uint64_t tmp_idle_ns; 60.51 + 60.52 + total_idle_ns = get_cpu_idle_time(i); 60.53 + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; 60.54 60.55 pxpt->u.last = from; 60.56 pxpt->u.cur = to; 60.57 pxpt->u.pt[to].count++; 60.58 pxpt->u.pt[from].residency += now - pxpt->prev_state_wall; 60.59 + pxpt->u.pt[from].residency -= tmp_idle_ns; 60.60 60.61 (*(pxpt->u.trans_pt + from*statnum + to))++; 60.62 60.63 pxpt->prev_state_wall = now; 60.64 + pxpt->prev_idle_wall = total_idle_ns; 60.65 } 60.66 } 60.67 60.68 @@ -87,6 +129,7 @@ int px_statistic_init(int cpuid) 60.69 pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency; 60.70 60.71 pxpt->prev_state_wall = NOW(); 60.72 + pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); 60.73 60.74 return 0; 60.75 } 60.76 @@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid) 60.77 } 60.78 60.79 pxpt->prev_state_wall = NOW(); 60.80 + pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); 60.81 } 60.82 60.83 60.84 @@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr 60.85 60.86 return ret; 60.87 } 60.88 + 60.89 + 60.90 +/********************************************************************* 60.91 + * CPUFREQ SUSPEND/RESUME * 60.92 + *********************************************************************/ 60.93 + 60.94 +void cpufreq_suspend(void) 60.95 +{ 60.96 + int cpu; 60.97 + 60.98 + /* to protect the case when Px was controlled by dom0-kernel */ 60.99 + /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */ 60.100 + for_each_online_cpu(cpu) { 60.101 + struct processor_performance *perf = &processor_pminfo[cpu].perf; 60.102 + 60.103 + if (!perf->init) 60.104 + return; 60.105 + } 60.106 + 60.107 + cpufreq_dom_dbs(CPUFREQ_GOV_STOP); 60.108 + 60.109 + cpufreq_dom_exit(); 60.110 + 60.111 + px_statistic_suspend(); 60.112 +} 60.113 + 60.114 +int cpufreq_resume(void) 60.115 +{ 60.116 + int cpu, ret = 0; 60.117 + 60.118 + /* 1. 
to protect the case when Px was controlled by dom0-kernel */ 60.119 + /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */ 60.120 + /* 2. set state and resume flag to sync cpu to right state and freq */ 60.121 + for_each_online_cpu(cpu) { 60.122 + struct processor_performance *perf = &processor_pminfo[cpu].perf; 60.123 + struct cpufreq_policy *policy = &xen_px_policy[cpu]; 60.124 + 60.125 + if (!perf->init) 60.126 + goto err; 60.127 + perf->state = 0; 60.128 + policy->resume = 1; 60.129 + } 60.130 + 60.131 + px_statistic_resume(); 60.132 + 60.133 + ret = cpufreq_dom_init(); 60.134 + if (ret) 60.135 + goto err; 60.136 + 60.137 + ret = cpufreq_dom_dbs(CPUFREQ_GOV_START); 60.138 + if (ret) 60.139 + goto err; 60.140 + 60.141 + return ret; 60.142 + 60.143 +err: 60.144 + cpufreq_dom_exit(); 60.145 + return ret; 60.146 +}
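The statistics changes above stop charging idle time to the current P-state: alongside prev_state_wall, each CPU now records prev_idle_wall, and residency is credited as elapsed wall time minus the idle time accumulated over the same window. A toy Python model of that bookkeeping (PxStat and the injected get_idle_ns callback are illustrative, not Xen interfaces):

    import time

    class PxStat(object):
        # Mirrors the residency accounting added to px_statistic_update().
        def __init__(self, get_idle_ns):
            self.get_idle_ns = get_idle_ns
            self.residency = {}                  # P-state -> busy nanoseconds
            self.cur = 0
            self.prev_state_wall = time.monotonic_ns()
            self.prev_idle_wall = get_idle_ns()

        def update(self, new_state):
            now = time.monotonic_ns()
            idle = self.get_idle_ns()
            busy = (now - self.prev_state_wall) - (idle - self.prev_idle_wall)
            self.residency[self.cur] = self.residency.get(self.cur, 0) + busy
            self.cur = new_state
            self.prev_state_wall = now
            self.prev_idle_wall = idle

The suspend/resume hooks do the same trick at the boundaries: px_statistic_suspend() closes out the current window (again subtracting idle time) and px_statistic_resume() re-bases both walls so time spent in S3 is not counted.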
61.1 --- a/xen/arch/x86/acpi/pmstat.c Thu Jun 19 12:48:04 2008 +0900 61.2 +++ b/xen/arch/x86/acpi/pmstat.c Wed Jul 02 11:30:37 2008 +0900 61.3 @@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get 61.4 case PMSTAT_get_pxstat: 61.5 { 61.6 uint64_t now, ct; 61.7 + uint64_t total_idle_ns; 61.8 + uint64_t tmp_idle_ns; 61.9 + 61.10 + total_idle_ns = get_cpu_idle_time(op->cpuid); 61.11 + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; 61.12 61.13 now = NOW(); 61.14 pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc; 61.15 pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall; 61.16 + pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns; 61.17 pxpt->prev_state_wall = now; 61.18 + pxpt->prev_idle_wall = total_idle_ns; 61.19 61.20 ct = pmpt->perf.state_count; 61.21 if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
62.1 --- a/xen/arch/x86/acpi/power.c Thu Jun 19 12:48:04 2008 +0900 62.2 +++ b/xen/arch/x86/acpi/power.c Wed Jul 02 11:30:37 2008 +0900 62.3 @@ -27,7 +27,7 @@ 62.4 #include <public/platform.h> 62.5 #include <asm/tboot.h> 62.6 62.7 -#define pmprintk(_l, _f, _a...) printk(_l "<PM> " _f "\n", ## _a ) 62.8 +#include <acpi/cpufreq/cpufreq.h> 62.9 62.10 static char opt_acpi_sleep[20]; 62.11 string_param("acpi_sleep", opt_acpi_sleep); 62.12 @@ -124,10 +124,12 @@ static int enter_state(u32 state) 62.13 if ( !spin_trylock(&pm_lock) ) 62.14 return -EBUSY; 62.15 62.16 - pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state); 62.17 + printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state); 62.18 62.19 freeze_domains(); 62.20 62.21 + cpufreq_suspend(); 62.22 + 62.23 disable_nonboot_cpus(); 62.24 if ( num_online_cpus() != 1 ) 62.25 { 62.26 @@ -139,11 +141,14 @@ static int enter_state(u32 state) 62.27 62.28 acpi_sleep_prepare(state); 62.29 62.30 + console_start_sync(); 62.31 + printk("Entering ACPI S%d state.\n", state); 62.32 + 62.33 local_irq_save(flags); 62.34 62.35 if ( (error = device_power_down()) ) 62.36 { 62.37 - pmprintk(XENLOG_ERR, "Some devices failed to power down."); 62.38 + printk(XENLOG_ERR "Some devices failed to power down."); 62.39 goto done; 62.40 } 62.41 62.42 @@ -162,8 +167,6 @@ static int enter_state(u32 state) 62.43 break; 62.44 } 62.45 62.46 - pmprintk(XENLOG_DEBUG, "Back to C."); 62.47 - 62.48 /* Restore CR4 and EFER from cached values. */ 62.49 write_cr4(read_cr4()); 62.50 if ( cpu_has_efer ) 62.51 @@ -171,16 +174,18 @@ static int enter_state(u32 state) 62.52 62.53 device_power_up(); 62.54 62.55 - pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state); 62.56 + printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state); 62.57 62.58 done: 62.59 local_irq_restore(flags); 62.60 + console_end_sync(); 62.61 acpi_sleep_post(state); 62.62 if ( !hvm_cpu_up() ) 62.63 BUG(); 62.64 62.65 enable_cpu: 62.66 enable_nonboot_cpus(); 62.67 + cpufreq_resume(); 62.68 thaw_domains(); 62.69 spin_unlock(&pm_lock); 62.70 return error; 62.71 @@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_ 62.72 ((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) & 62.73 ACPI_BITMASK_SLEEP_ENABLE) ) 62.74 { 62.75 - pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting."); 62.76 + gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting."); 62.77 return -EINVAL; 62.78 } 62.79 62.80 @@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_ 62.81 if ( tboot_in_measured_env() ) 62.82 { 62.83 tboot_sleep(sleep_state); 62.84 - pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n"); 62.85 + printk(XENLOG_ERR "TBOOT failed entering s3 state\n"); 62.86 return_ACPI_STATUS(AE_ERROR); 62.87 } 62.88 62.89 @@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void) 62.90 p += strspn(p, ", \t"); 62.91 } 62.92 62.93 - printk(XENLOG_INFO "<PM> ACPI (supports"); 62.94 + printk(XENLOG_INFO "ACPI sleep modes:"); 62.95 for ( i = 0; i < ACPI_S_STATE_COUNT; i++ ) 62.96 { 62.97 if ( i == ACPI_STATE_S3 ) 62.98 @@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void) 62.99 else 62.100 sleep_states[i] = 0; 62.101 } 62.102 - printk(")\n"); 62.103 + printk("\n"); 62.104 62.105 return 0; 62.106 }
63.1 --- a/xen/arch/x86/hvm/emulate.c Thu Jun 19 12:48:04 2008 +0900 63.2 +++ b/xen/arch/x86/hvm/emulate.c Wed Jul 02 11:30:37 2008 +0900 63.3 @@ -21,15 +21,33 @@ 63.4 63.5 static int hvmemul_do_io( 63.6 int is_mmio, paddr_t addr, unsigned long *reps, int size, 63.7 - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) 63.8 + paddr_t ram_gpa, int dir, int df, void *p_data) 63.9 { 63.10 + paddr_t value = ram_gpa; 63.11 + int value_is_ptr = (p_data == NULL); 63.12 struct vcpu *curr = current; 63.13 vcpu_iodata_t *vio = get_ioreq(curr); 63.14 ioreq_t *p = &vio->vp_ioreq; 63.15 int rc; 63.16 63.17 - /* Only retrieve the value from singleton (non-REP) reads. */ 63.18 - ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr)); 63.19 + /* 63.20 + * Weird-sized accesses have undefined behaviour: we discard writes 63.21 + * and read all-ones. 63.22 + */ 63.23 + if ( unlikely((size > sizeof(long)) || (size & (size - 1))) ) 63.24 + { 63.25 + gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size); 63.26 + ASSERT(p_data != NULL); /* cannot happen with a REP prefix */ 63.27 + if ( dir == IOREQ_READ ) 63.28 + memset(p_data, ~0, size); 63.29 + return X86EMUL_UNHANDLEABLE; 63.30 + } 63.31 + 63.32 + if ( (p_data != NULL) && (dir == IOREQ_WRITE) ) 63.33 + { 63.34 + memcpy(&value, p_data, size); 63.35 + p_data = NULL; 63.36 + } 63.37 63.38 if ( is_mmio && !value_is_ptr ) 63.39 { 63.40 @@ -47,8 +65,7 @@ static int hvmemul_do_io( 63.41 unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes; 63.42 if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) ) 63.43 { 63.44 - *val = 0; 63.45 - memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa], 63.46 + memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa], 63.47 size); 63.48 return X86EMUL_OKAY; 63.49 } 63.50 @@ -61,7 +78,7 @@ static int hvmemul_do_io( 63.51 break; 63.52 case HVMIO_completed: 63.53 curr->arch.hvm_vcpu.io_state = HVMIO_none; 63.54 - if ( val == NULL ) 63.55 + if ( p_data == NULL ) 63.56 return X86EMUL_UNHANDLEABLE; 63.57 goto finish_access; 63.58 case HVMIO_dispatched: 63.59 @@ -82,7 +99,7 @@ static int hvmemul_do_io( 63.60 } 63.61 63.62 curr->arch.hvm_vcpu.io_state = 63.63 - (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; 63.64 + (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; 63.65 63.66 p->dir = dir; 63.67 p->data_is_ptr = value_is_ptr; 63.68 @@ -116,7 +133,7 @@ static int hvmemul_do_io( 63.69 break; 63.70 case X86EMUL_UNHANDLEABLE: 63.71 hvm_send_assist_req(curr); 63.72 - rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY; 63.73 + rc = (p_data != NULL) ? 
X86EMUL_RETRY : X86EMUL_OKAY; 63.74 break; 63.75 default: 63.76 BUG(); 63.77 @@ -126,8 +143,8 @@ static int hvmemul_do_io( 63.78 return rc; 63.79 63.80 finish_access: 63.81 - if ( val != NULL ) 63.82 - *val = curr->arch.hvm_vcpu.io_data; 63.83 + if ( p_data != NULL ) 63.84 + memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size); 63.85 63.86 if ( is_mmio && !value_is_ptr ) 63.87 { 63.88 @@ -152,7 +169,7 @@ static int hvmemul_do_io( 63.89 sizeof(curr->arch.hvm_vcpu.mmio_large_read)) ) 63.90 { 63.91 memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa], 63.92 - val, size); 63.93 + p_data, size); 63.94 curr->arch.hvm_vcpu.mmio_large_read_bytes += size; 63.95 } 63.96 } 63.97 @@ -163,18 +180,16 @@ static int hvmemul_do_io( 63.98 63.99 static int hvmemul_do_pio( 63.100 unsigned long port, unsigned long *reps, int size, 63.101 - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) 63.102 + paddr_t ram_gpa, int dir, int df, void *p_data) 63.103 { 63.104 - return hvmemul_do_io(0, port, reps, size, value, 63.105 - dir, df, value_is_ptr, val); 63.106 + return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data); 63.107 } 63.108 63.109 static int hvmemul_do_mmio( 63.110 paddr_t gpa, unsigned long *reps, int size, 63.111 - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) 63.112 + paddr_t ram_gpa, int dir, int df, void *p_data) 63.113 { 63.114 - return hvmemul_do_io(1, gpa, reps, size, value, 63.115 - dir, df, value_is_ptr, val); 63.116 + return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data); 63.117 } 63.118 63.119 /* 63.120 @@ -287,7 +302,7 @@ static int hvmemul_virtual_to_linear( 63.121 static int __hvmemul_read( 63.122 enum x86_segment seg, 63.123 unsigned long offset, 63.124 - unsigned long *val, 63.125 + void *p_data, 63.126 unsigned int bytes, 63.127 enum hvm_access_type access_type, 63.128 struct hvm_emulate_ctxt *hvmemul_ctxt) 63.129 @@ -303,8 +318,6 @@ static int __hvmemul_read( 63.130 if ( rc != X86EMUL_OKAY ) 63.131 return rc; 63.132 63.133 - *val = 0; 63.134 - 63.135 if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) && 63.136 curr->arch.hvm_vcpu.mmio_gva ) 63.137 { 63.138 @@ -314,7 +327,7 @@ static int __hvmemul_read( 63.139 gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); 63.140 if ( (off + bytes) <= PAGE_SIZE ) 63.141 return hvmemul_do_mmio(gpa, &reps, bytes, 0, 63.142 - IOREQ_READ, 0, 0, val); 63.143 + IOREQ_READ, 0, p_data); 63.144 } 63.145 63.146 if ( (seg != x86_seg_none) && 63.147 @@ -322,15 +335,13 @@ static int __hvmemul_read( 63.148 pfec |= PFEC_user_mode; 63.149 63.150 rc = ((access_type == hvm_access_insn_fetch) ? 
63.151 - hvm_fetch_from_guest_virt(val, addr, bytes, pfec) : 63.152 - hvm_copy_from_guest_virt(val, addr, bytes, pfec)); 63.153 + hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) : 63.154 + hvm_copy_from_guest_virt(p_data, addr, bytes, pfec)); 63.155 if ( rc == HVMCOPY_bad_gva_to_gfn ) 63.156 return X86EMUL_EXCEPTION; 63.157 63.158 if ( rc == HVMCOPY_bad_gfn_to_mfn ) 63.159 { 63.160 - unsigned long reps = 1; 63.161 - 63.162 if ( access_type == hvm_access_insn_fetch ) 63.163 return X86EMUL_UNHANDLEABLE; 63.164 63.165 @@ -339,7 +350,7 @@ static int __hvmemul_read( 63.166 if ( rc != X86EMUL_OKAY ) 63.167 return rc; 63.168 63.169 - return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val); 63.170 + return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data); 63.171 } 63.172 63.173 return X86EMUL_OKAY; 63.174 @@ -348,19 +359,19 @@ static int __hvmemul_read( 63.175 static int hvmemul_read( 63.176 enum x86_segment seg, 63.177 unsigned long offset, 63.178 - unsigned long *val, 63.179 + void *p_data, 63.180 unsigned int bytes, 63.181 struct x86_emulate_ctxt *ctxt) 63.182 { 63.183 return __hvmemul_read( 63.184 - seg, offset, val, bytes, hvm_access_read, 63.185 + seg, offset, p_data, bytes, hvm_access_read, 63.186 container_of(ctxt, struct hvm_emulate_ctxt, ctxt)); 63.187 } 63.188 63.189 static int hvmemul_insn_fetch( 63.190 enum x86_segment seg, 63.191 unsigned long offset, 63.192 - unsigned long *val, 63.193 + void *p_data, 63.194 unsigned int bytes, 63.195 struct x86_emulate_ctxt *ctxt) 63.196 { 63.197 @@ -371,19 +382,18 @@ static int hvmemul_insn_fetch( 63.198 /* Fall back if requested bytes are not in the prefetch cache. */ 63.199 if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) ) 63.200 return __hvmemul_read( 63.201 - seg, offset, val, bytes, 63.202 + seg, offset, p_data, bytes, 63.203 hvm_access_insn_fetch, hvmemul_ctxt); 63.204 63.205 /* Hit the cache. Simple memcpy. 
*/ 63.206 - *val = 0; 63.207 - memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes); 63.208 + memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes); 63.209 return X86EMUL_OKAY; 63.210 } 63.211 63.212 static int hvmemul_write( 63.213 enum x86_segment seg, 63.214 unsigned long offset, 63.215 - unsigned long val, 63.216 + void *p_data, 63.217 unsigned int bytes, 63.218 struct x86_emulate_ctxt *ctxt) 63.219 { 63.220 @@ -406,29 +416,27 @@ static int hvmemul_write( 63.221 unsigned int off = addr & (PAGE_SIZE - 1); 63.222 gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); 63.223 if ( (off + bytes) <= PAGE_SIZE ) 63.224 - return hvmemul_do_mmio(gpa, &reps, bytes, val, 63.225 - IOREQ_WRITE, 0, 0, NULL); 63.226 + return hvmemul_do_mmio(gpa, &reps, bytes, 0, 63.227 + IOREQ_WRITE, 0, p_data); 63.228 } 63.229 63.230 if ( (seg != x86_seg_none) && 63.231 (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) ) 63.232 pfec |= PFEC_user_mode; 63.233 63.234 - rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec); 63.235 + rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec); 63.236 if ( rc == HVMCOPY_bad_gva_to_gfn ) 63.237 return X86EMUL_EXCEPTION; 63.238 63.239 if ( rc == HVMCOPY_bad_gfn_to_mfn ) 63.240 { 63.241 - unsigned long reps = 1; 63.242 - 63.243 rc = hvmemul_linear_to_phys( 63.244 addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt); 63.245 if ( rc != X86EMUL_OKAY ) 63.246 return rc; 63.247 63.248 - return hvmemul_do_mmio(gpa, &reps, bytes, val, 63.249 - IOREQ_WRITE, 0, 0, NULL); 63.250 + return hvmemul_do_mmio(gpa, &reps, bytes, 0, 63.251 + IOREQ_WRITE, 0, p_data); 63.252 } 63.253 63.254 return X86EMUL_OKAY; 63.255 @@ -442,12 +450,8 @@ static int hvmemul_cmpxchg( 63.256 unsigned int bytes, 63.257 struct x86_emulate_ctxt *ctxt) 63.258 { 63.259 - unsigned long new = 0; 63.260 - if ( bytes > sizeof(new) ) 63.261 - return X86EMUL_UNHANDLEABLE; 63.262 - memcpy(&new, p_new, bytes); 63.263 /* Fix this in case the guest is really relying on r-m-w atomicity. 
*/ 63.264 - return hvmemul_write(seg, offset, new, bytes, ctxt); 63.265 + return hvmemul_write(seg, offset, p_new, bytes, ctxt); 63.266 } 63.267 63.268 static int hvmemul_rep_ins( 63.269 @@ -480,7 +484,7 @@ static int hvmemul_rep_ins( 63.270 return rc; 63.271 63.272 return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, 63.273 - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); 63.274 + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); 63.275 } 63.276 63.277 static int hvmemul_rep_outs( 63.278 @@ -513,7 +517,7 @@ static int hvmemul_rep_outs( 63.279 return rc; 63.280 63.281 return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, 63.282 - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); 63.283 + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); 63.284 } 63.285 63.286 static int hvmemul_rep_movs( 63.287 @@ -563,14 +567,14 @@ static int hvmemul_rep_movs( 63.288 if ( !p2m_is_ram(p2mt) ) 63.289 return hvmemul_do_mmio( 63.290 sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, 63.291 - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); 63.292 + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); 63.293 63.294 (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt); 63.295 if ( p2m_is_ram(p2mt) ) 63.296 return X86EMUL_UNHANDLEABLE; 63.297 return hvmemul_do_mmio( 63.298 dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, 63.299 - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); 63.300 + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); 63.301 } 63.302 63.303 static int hvmemul_read_segment( 63.304 @@ -607,7 +611,8 @@ static int hvmemul_read_io( 63.305 struct x86_emulate_ctxt *ctxt) 63.306 { 63.307 unsigned long reps = 1; 63.308 - return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val); 63.309 + *val = 0; 63.310 + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val); 63.311 } 63.312 63.313 static int hvmemul_write_io( 63.314 @@ -617,7 +622,7 @@ static int hvmemul_write_io( 63.315 struct x86_emulate_ctxt *ctxt) 63.316 { 63.317 unsigned long reps = 1; 63.318 - return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL); 63.319 + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val); 63.320 } 63.321 63.322 static int hvmemul_read_cr(
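The hunk above switches the HVM emulator's read, write and insn-fetch callbacks from passing values in an unsigned long to passing a caller-supplied buffer (void *p_data plus a byte count), so wide accesses no longer have to squeeze into a machine word or be pre-zeroed. A minimal sketch of a callback under the new shape; my_emulated_read and its src argument are illustrative only, not part of Xen, and headers/error handling are omitted:

    /* Sketch only: the callback fills the caller's buffer directly.
     * Under the old interface the result had to fit in an unsigned long
     * and be zeroed first; now it simply copies 'bytes' bytes. */
    static int my_emulated_read(void *p_data, unsigned int bytes,
                                const unsigned char *src)
    {
        memcpy(p_data, src, bytes);     /* no width limit, no "*val = 0" */
        return X86EMUL_OKAY;
    }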
64.1 --- a/xen/arch/x86/hvm/hvm.c Thu Jun 19 12:48:04 2008 +0900 64.2 +++ b/xen/arch/x86/hvm/hvm.c Wed Jul 02 11:30:37 2008 +0900 64.3 @@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE 64.4 break; 64.5 } 64.6 64.7 + case HVMOP_modified_memory: 64.8 + { 64.9 + struct xen_hvm_modified_memory a; 64.10 + struct domain *d; 64.11 + unsigned long pfn; 64.12 + 64.13 + if ( copy_from_guest(&a, arg, 1) ) 64.14 + return -EFAULT; 64.15 + 64.16 + if ( a.domid == DOMID_SELF ) 64.17 + { 64.18 + d = rcu_lock_current_domain(); 64.19 + } 64.20 + else 64.21 + { 64.22 + if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL ) 64.23 + return -ESRCH; 64.24 + if ( !IS_PRIV_FOR(current->domain, d) ) 64.25 + { 64.26 + rc = -EPERM; 64.27 + goto param_fail3; 64.28 + } 64.29 + } 64.30 + 64.31 + rc = -EINVAL; 64.32 + if ( !is_hvm_domain(d) ) 64.33 + goto param_fail3; 64.34 + 64.35 + rc = xsm_hvm_param(d, op); 64.36 + if ( rc ) 64.37 + goto param_fail3; 64.38 + 64.39 + rc = -EINVAL; 64.40 + if ( (a.first_pfn > domain_get_maximum_gpfn(d)) || 64.41 + ((a.first_pfn + a.nr - 1) < a.first_pfn) || 64.42 + ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) 64.43 + goto param_fail3; 64.44 + 64.45 + rc = 0; 64.46 + if ( !paging_mode_log_dirty(d) ) 64.47 + goto param_fail3; 64.48 + 64.49 + for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) 64.50 + { 64.51 + p2m_type_t t; 64.52 + mfn_t mfn = gfn_to_mfn(d, pfn, &t); 64.53 + if ( mfn_x(mfn) != INVALID_MFN ) 64.54 + { 64.55 + paging_mark_dirty(d, mfn_x(mfn)); 64.56 + /* These are most probably not page tables any more */ 64.57 + /* don't take a long time and don't die either */ 64.58 + sh_remove_shadows(d->vcpu[0], mfn, 1, 0); 64.59 + } 64.60 + } 64.61 + 64.62 + param_fail3: 64.63 + rcu_unlock_domain(d); 64.64 + break; 64.65 + } 64.66 + 64.67 default: 64.68 { 64.69 gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
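The new HVMOP_modified_memory hypercall lets a sufficiently privileged caller tell Xen that it has written directly into an HVM guest's memory (for example while restoring or patching guest pages), so the pages are marked dirty under log-dirty mode and any stale shadows are dropped. A hedged sketch of how a tool might fill in the argument structure; the do_hvm_hypercall() wrapper is an assumption standing in for the toolstack's actual hypercall plumbing:

    /* Sketch: announce that nr_pages pages starting at start_pfn of domain
     * 'domid' were modified behind the guest's back.  The fields match the
     * range checks in the handler above (first_pfn + nr must neither wrap
     * nor exceed the domain's maximum gpfn). */
    struct xen_hvm_modified_memory a = {
        .domid     = domid,
        .first_pfn = start_pfn,
        .nr        = nr_pages,
    };
    rc = do_hvm_hypercall(HVMOP_modified_memory, &a);  /* hypothetical wrapper */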
65.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c Thu Jun 19 12:48:04 2008 +0900 65.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c Wed Jul 02 11:30:37 2008 +0900 65.3 @@ -677,10 +677,11 @@ static int construct_vmcs(struct vcpu *v 65.4 return 0; 65.5 } 65.6 65.7 -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val) 65.8 +int vmx_read_guest_msr(u32 msr, u64 *val) 65.9 { 65.10 - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; 65.11 - const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; 65.12 + struct vcpu *curr = current; 65.13 + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; 65.14 + const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; 65.15 65.16 for ( i = 0; i < msr_count; i++ ) 65.17 { 65.18 @@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u 65.19 return -ESRCH; 65.20 } 65.21 65.22 -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val) 65.23 +int vmx_write_guest_msr(u32 msr, u64 val) 65.24 { 65.25 - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; 65.26 - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; 65.27 + struct vcpu *curr = current; 65.28 + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; 65.29 + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; 65.30 65.31 for ( i = 0; i < msr_count; i++ ) 65.32 { 65.33 @@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v, 65.34 return -ESRCH; 65.35 } 65.36 65.37 -int vmx_add_guest_msr(struct vcpu *v, u32 msr) 65.38 +int vmx_add_guest_msr(u32 msr) 65.39 { 65.40 - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; 65.41 - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; 65.42 + struct vcpu *curr = current; 65.43 + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; 65.44 + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; 65.45 + 65.46 + if ( msr_area == NULL ) 65.47 + { 65.48 + if ( (msr_area = alloc_xenheap_page()) == NULL ) 65.49 + return -ENOMEM; 65.50 + curr->arch.hvm_vmx.msr_area = msr_area; 65.51 + __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area)); 65.52 + __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); 65.53 + } 65.54 65.55 for ( i = 0; i < msr_count; i++ ) 65.56 if ( msr_area[i].index == msr ) 65.57 @@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3 65.58 if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) 65.59 return -ENOSPC; 65.60 65.61 - if ( msr_area == NULL ) 65.62 - { 65.63 - if ( (msr_area = alloc_xenheap_page()) == NULL ) 65.64 - return -ENOMEM; 65.65 - v->arch.hvm_vmx.msr_area = msr_area; 65.66 - __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area)); 65.67 - __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); 65.68 - } 65.69 - 65.70 msr_area[msr_count].index = msr; 65.71 msr_area[msr_count].mbz = 0; 65.72 msr_area[msr_count].data = 0; 65.73 - v->arch.hvm_vmx.msr_count = ++msr_count; 65.74 + curr->arch.hvm_vmx.msr_count = ++msr_count; 65.75 __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count); 65.76 __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count); 65.77 65.78 return 0; 65.79 } 65.80 65.81 -int vmx_add_host_load_msr(struct vcpu *v, u32 msr) 65.82 +int vmx_add_host_load_msr(u32 msr) 65.83 { 65.84 - unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count; 65.85 - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area; 65.86 + struct vcpu *curr = current; 65.87 + unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count; 65.88 + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area; 65.89 + 65.90 + if ( msr_area == NULL ) 65.91 + { 65.92 + if 
( (msr_area = alloc_xenheap_page()) == NULL ) 65.93 + return -ENOMEM; 65.94 + curr->arch.hvm_vmx.host_msr_area = msr_area; 65.95 + __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); 65.96 + } 65.97 65.98 for ( i = 0; i < msr_count; i++ ) 65.99 if ( msr_area[i].index == msr ) 65.100 @@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v 65.101 if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) 65.102 return -ENOSPC; 65.103 65.104 - if ( msr_area == NULL ) 65.105 - { 65.106 - if ( (msr_area = alloc_xenheap_page()) == NULL ) 65.107 - return -ENOMEM; 65.108 - v->arch.hvm_vmx.host_msr_area = msr_area; 65.109 - __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); 65.110 - } 65.111 - 65.112 msr_area[msr_count].index = msr; 65.113 msr_area[msr_count].mbz = 0; 65.114 rdmsrl(msr, msr_area[msr_count].data); 65.115 - v->arch.hvm_vmx.host_msr_count = ++msr_count; 65.116 + curr->arch.hvm_vmx.host_msr_count = ++msr_count; 65.117 __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count); 65.118 65.119 return 0; 65.120 @@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v) 65.121 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; 65.122 int rc; 65.123 65.124 - if ( arch_vmx->vmcs == NULL ) 65.125 - { 65.126 - if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL ) 65.127 - return -ENOMEM; 65.128 + if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL ) 65.129 + return -ENOMEM; 65.130 65.131 - INIT_LIST_HEAD(&arch_vmx->active_list); 65.132 - __vmpclear(virt_to_maddr(arch_vmx->vmcs)); 65.133 - arch_vmx->active_cpu = -1; 65.134 - arch_vmx->launched = 0; 65.135 - } 65.136 + INIT_LIST_HEAD(&arch_vmx->active_list); 65.137 + __vmpclear(virt_to_maddr(arch_vmx->vmcs)); 65.138 + arch_vmx->active_cpu = -1; 65.139 + arch_vmx->launched = 0; 65.140 65.141 if ( (rc = construct_vmcs(v)) != 0 ) 65.142 { 65.143 vmx_free_vmcs(arch_vmx->vmcs); 65.144 - arch_vmx->vmcs = NULL; 65.145 return rc; 65.146 } 65.147 65.148 @@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v) 65.149 { 65.150 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; 65.151 65.152 - if ( arch_vmx->vmcs == NULL ) 65.153 - return; 65.154 - 65.155 vmx_clear_vmcs(v); 65.156 65.157 vmx_free_vmcs(arch_vmx->vmcs); 65.158 - arch_vmx->vmcs = NULL; 65.159 + 65.160 + free_xenheap_page(v->arch.hvm_vmx.host_msr_area); 65.161 + free_xenheap_page(v->arch.hvm_vmx.msr_area); 65.162 + free_xenheap_page(v->arch.hvm_vmx.msr_bitmap); 65.163 } 65.164 65.165 void vm_launch_fail(void)
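Besides dropping the explicit vcpu argument (the helpers now act on 'current', the vcpu whose VMCS is loaded), vmx_add_guest_msr() and vmx_add_host_load_msr() now allocate and wire up the MSR-area page before the duplicate/capacity scan, and vmx_destroy_vmcs() frees those pages. A short sketch of the resulting call-site pattern, using an MSR that appears elsewhere in this changeset; return-value handling is trimmed:

    /* Sketch: add an MSR to the guest auto-save/restore area, then access
     * the saved copy.  All three helpers implicitly refer to 'current'. */
    u64 val;

    if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) != 0 )
        /* -ENOMEM (page allocation failed) or -ENOSPC (area full) */ ;
    else
    {
        vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &val); /* val == -1ULL */
    }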
66.1 --- a/xen/arch/x86/hvm/vmx/vmx.c Thu Jun 19 12:48:04 2008 +0900 66.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Jul 02 11:30:37 2008 +0900 66.3 @@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e 66.4 break; 66.5 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: 66.6 value = v->arch.hvm_vcpu.guest_cr[0]; 66.7 - value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF); 66.8 + /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */ 66.9 + value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf); 66.10 HVMTRACE_LONG_1D(LMSW, current, value); 66.11 return !hvm_set_cr0(value); 66.12 default: 66.13 @@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct 66.14 goto done; 66.15 } 66.16 66.17 - if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 ) 66.18 + if ( vmx_read_guest_msr(ecx, &msr_content) == 0 ) 66.19 break; 66.20 66.21 if ( is_last_branch_msr(ecx) ) 66.22 @@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc 66.23 66.24 for ( ; (rc == 0) && lbr->count; lbr++ ) 66.25 for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) 66.26 - if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 ) 66.27 + if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 ) 66.28 vmx_disable_intercept_for_msr(v, lbr->base + i); 66.29 } 66.30 66.31 if ( (rc < 0) || 66.32 - (vmx_add_host_load_msr(v, ecx) < 0) ) 66.33 + (vmx_add_host_load_msr(ecx) < 0) ) 66.34 vmx_inject_hw_exception(v, TRAP_machine_check, 0); 66.35 else 66.36 { 66.37 @@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc 66.38 switch ( long_mode_do_msr_write(regs) ) 66.39 { 66.40 case HNDL_unhandled: 66.41 - if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) && 66.42 + if ( (vmx_write_guest_msr(ecx, msr_content) != 0) && 66.43 !is_last_branch_msr(ecx) ) 66.44 wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx); 66.45 break;
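The LMSW fix above matters because LMSW may set CR0 bits 0-3 (PE, MP, EM, TS) but may only clear bits 1-3: it can never clear PE, and it must leave bits 4-15 untouched. A worked example with illustrative values showing why the old 16-bit replacement was wrong:

    /* Guest CR0 = 0x8005003b (PE, MP, TS, ET, NE set), LMSW operand = 0. */
    unsigned long cr0 = 0x8005003bUL, op = 0x0UL;

    unsigned long old_way = (cr0 & ~0xFFFF) | (op & 0xFFFF); /* 0x80050000: clears PE, ET, NE - illegal for LMSW */
    unsigned long new_way = (cr0 & ~0xe)    | (op & 0xf);    /* 0x80050031: clears only MP and TS, PE preserved  */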
67.1 --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Thu Jun 19 12:48:04 2008 +0900 67.2 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Wed Jul 02 11:30:37 2008 +0900 67.3 @@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str 67.4 return 0; 67.5 67.6 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 67.7 - if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) 67.8 + if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) 67.9 return 0; 67.10 67.11 - if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) 67.12 + if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) 67.13 return 0; 67.14 - vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL); 67.15 + vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL); 67.16 67.17 pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) + 67.18 (core2_get_pmc_count()-1)*sizeof(char)); 67.19 @@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp 67.20 break; 67.21 case MSR_CORE_PERF_FIXED_CTR_CTRL: 67.22 non_global_ctrl = msr_content; 67.23 - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); 67.24 + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); 67.25 global_ctrl >>= 32; 67.26 for ( i = 0; i < 3; i++ ) 67.27 { 67.28 @@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp 67.29 break; 67.30 default: 67.31 tmp = ecx - MSR_P6_EVNTSEL0; 67.32 - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); 67.33 + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); 67.34 if ( tmp >= 0 && tmp < core2_get_pmc_count() ) 67.35 core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] = 67.36 (global_ctrl >> tmp) & (msr_content >> 22) & 1; 67.37 @@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp 67.38 if ( type != MSR_TYPE_GLOBAL ) 67.39 wrmsrl(ecx, msr_content); 67.40 else 67.41 - vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); 67.42 + vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); 67.43 67.44 return 1; 67.45 } 67.46 @@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp 67.47 msr_content = core2_vpmu_cxt->global_ovf_status; 67.48 break; 67.49 case MSR_CORE_PERF_GLOBAL_CTRL: 67.50 - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content); 67.51 + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content); 67.52 break; 67.53 default: 67.54 rdmsrl(regs->ecx, msr_content);
68.1 --- a/xen/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900 68.2 +++ b/xen/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900 68.3 @@ -219,7 +219,7 @@ void __init arch_init_memory(void) 68.4 * Any Xen-heap pages that we will allow to be mapped will have 68.5 * their domain field set to dom_xen. 68.6 */ 68.7 - dom_xen = alloc_domain(DOMID_XEN); 68.8 + dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); 68.9 BUG_ON(dom_xen == NULL); 68.10 68.11 /* 68.12 @@ -227,7 +227,7 @@ void __init arch_init_memory(void) 68.13 * This domain owns I/O pages that are within the range of the page_info 68.14 * array. Mappings occur at the priv of the caller. 68.15 */ 68.16 - dom_io = alloc_domain(DOMID_IO); 68.17 + dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); 68.18 BUG_ON(dom_io == NULL); 68.19 68.20 /* First 1MB of RAM is historically marked as I/O. */ 68.21 @@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page 68.22 { 68.23 struct domain *d = page_get_owner(page); 68.24 68.25 - /* Never allow a shadowed frame to go from type count 0 to 1 */ 68.26 - if ( d && shadow_mode_enabled(d) ) 68.27 - shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page))); 68.28 + /* Normally we should never let a page go from type count 0 68.29 + * to type count 1 when it is shadowed. One exception: 68.30 + * out-of-sync shadowed pages are allowed to become 68.31 + * writeable. */ 68.32 + if ( d && shadow_mode_enabled(d) 68.33 + && (page->count_info & PGC_page_table) 68.34 + && !((page->shadow_flags & (1u<<29)) 68.35 + && type == PGT_writable_page) ) 68.36 + shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page))); 68.37 68.38 ASSERT(!(x & PGT_pae_xen_l2)); 68.39 if ( (x & PGT_type_mask) != type ) 68.40 @@ -3533,15 +3539,14 @@ struct ptwr_emulate_ctxt { 68.41 static int ptwr_emulated_read( 68.42 enum x86_segment seg, 68.43 unsigned long offset, 68.44 - unsigned long *val, 68.45 + void *p_data, 68.46 unsigned int bytes, 68.47 struct x86_emulate_ctxt *ctxt) 68.48 { 68.49 unsigned int rc; 68.50 unsigned long addr = offset; 68.51 68.52 - *val = 0; 68.53 - if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 ) 68.54 + if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 ) 68.55 { 68.56 propagate_page_fault(addr + bytes - rc, 0); /* read fault */ 68.57 return X86EMUL_EXCEPTION; 68.58 @@ -3568,7 +3573,7 @@ static int ptwr_emulated_update( 68.59 /* Only allow naturally-aligned stores within the original %cr2 page. 
*/ 68.60 if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) ) 68.61 { 68.62 - MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)", 68.63 + MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)", 68.64 ptwr_ctxt->cr2, addr, bytes); 68.65 return X86EMUL_UNHANDLEABLE; 68.66 } 68.67 @@ -3676,10 +3681,21 @@ static int ptwr_emulated_update( 68.68 static int ptwr_emulated_write( 68.69 enum x86_segment seg, 68.70 unsigned long offset, 68.71 - unsigned long val, 68.72 + void *p_data, 68.73 unsigned int bytes, 68.74 struct x86_emulate_ctxt *ctxt) 68.75 { 68.76 + paddr_t val = 0; 68.77 + 68.78 + if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) 68.79 + { 68.80 + MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)", 68.81 + offset, bytes); 68.82 + return X86EMUL_UNHANDLEABLE; 68.83 + } 68.84 + 68.85 + memcpy(&val, p_data, bytes); 68.86 + 68.87 return ptwr_emulated_update( 68.88 offset, 0, val, bytes, 0, 68.89 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt)); 68.90 @@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg( 68.91 struct x86_emulate_ctxt *ctxt) 68.92 { 68.93 paddr_t old = 0, new = 0; 68.94 - if ( bytes > sizeof(paddr_t) ) 68.95 + 68.96 + if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) 68.97 + { 68.98 + MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)", 68.99 + offset, bytes); 68.100 return X86EMUL_UNHANDLEABLE; 68.101 + } 68.102 + 68.103 memcpy(&old, p_old, bytes); 68.104 memcpy(&new, p_new, bytes); 68.105 + 68.106 return ptwr_emulated_update( 68.107 offset, old, new, bytes, 1, 68.108 container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
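ptwr_emulated_write() and ptwr_emulated_cmpxchg() now reject any access wider than a PTE or not a power of two, since a page-table update must be a single naturally-sized store. The test uses the usual bit trick: (bytes & (bytes - 1)) is non-zero whenever more than one bit of bytes is set. A standalone sketch of the same check (the helper name is illustrative):

    /* Mirrors the check added above: anything over sizeof(paddr_t), or a
     * non-power-of-two width such as 3 or 6 bytes, is unhandleable. */
    static int ptwr_bad_size(unsigned int bytes)
    {
        return (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1));
    }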
69.1 --- a/xen/arch/x86/mm/shadow/common.c Thu Jun 19 12:48:04 2008 +0900 69.2 +++ b/xen/arch/x86/mm/shadow/common.c Wed Jul 02 11:30:37 2008 +0900 69.3 @@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d 69.4 /* Use shadow pagetables for log-dirty support */ 69.5 paging_log_dirty_init(d, shadow_enable_log_dirty, 69.6 shadow_disable_log_dirty, shadow_clean_dirty_bitmap); 69.7 + 69.8 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.9 + d->arch.paging.shadow.oos_active = 0; 69.10 +#endif 69.11 } 69.12 69.13 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important 69.14 @@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d 69.15 */ 69.16 void shadow_vcpu_init(struct vcpu *v) 69.17 { 69.18 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.19 + int i; 69.20 + 69.21 + for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) 69.22 + { 69.23 + v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN); 69.24 + v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN); 69.25 + } 69.26 +#endif 69.27 + 69.28 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); 69.29 } 69.30 69.31 @@ -131,7 +145,7 @@ static int hvm_translate_linear_addr( 69.32 static int 69.33 hvm_read(enum x86_segment seg, 69.34 unsigned long offset, 69.35 - unsigned long *val, 69.36 + void *p_data, 69.37 unsigned int bytes, 69.38 enum hvm_access_type access_type, 69.39 struct sh_emulate_ctxt *sh_ctxt) 69.40 @@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg, 69.41 if ( rc ) 69.42 return rc; 69.43 69.44 - *val = 0; 69.45 - 69.46 if ( access_type == hvm_access_insn_fetch ) 69.47 - rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0); 69.48 + rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0); 69.49 else 69.50 - rc = hvm_copy_from_guest_virt(val, addr, bytes, 0); 69.51 + rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0); 69.52 69.53 switch ( rc ) 69.54 { 69.55 @@ -167,20 +179,20 @@ hvm_read(enum x86_segment seg, 69.56 static int 69.57 hvm_emulate_read(enum x86_segment seg, 69.58 unsigned long offset, 69.59 - unsigned long *val, 69.60 + void *p_data, 69.61 unsigned int bytes, 69.62 struct x86_emulate_ctxt *ctxt) 69.63 { 69.64 if ( !is_x86_user_segment(seg) ) 69.65 return X86EMUL_UNHANDLEABLE; 69.66 - return hvm_read(seg, offset, val, bytes, hvm_access_read, 69.67 + return hvm_read(seg, offset, p_data, bytes, hvm_access_read, 69.68 container_of(ctxt, struct sh_emulate_ctxt, ctxt)); 69.69 } 69.70 69.71 static int 69.72 hvm_emulate_insn_fetch(enum x86_segment seg, 69.73 unsigned long offset, 69.74 - unsigned long *val, 69.75 + void *p_data, 69.76 unsigned int bytes, 69.77 struct x86_emulate_ctxt *ctxt) 69.78 { 69.79 @@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment 69.80 69.81 /* Fall back if requested bytes are not in the prefetch cache. */ 69.82 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) 69.83 - return hvm_read(seg, offset, val, bytes, 69.84 + return hvm_read(seg, offset, p_data, bytes, 69.85 hvm_access_insn_fetch, sh_ctxt); 69.86 69.87 /* Hit the cache. Simple memcpy. 
*/ 69.88 - *val = 0; 69.89 - memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes); 69.90 + memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); 69.91 return X86EMUL_OKAY; 69.92 } 69.93 69.94 static int 69.95 hvm_emulate_write(enum x86_segment seg, 69.96 unsigned long offset, 69.97 - unsigned long val, 69.98 + void *p_data, 69.99 unsigned int bytes, 69.100 struct x86_emulate_ctxt *ctxt) 69.101 { 69.102 @@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg, 69.103 return rc; 69.104 69.105 return v->arch.paging.mode->shadow.x86_emulate_write( 69.106 - v, addr, &val, bytes, sh_ctxt); 69.107 + v, addr, p_data, bytes, sh_ctxt); 69.108 } 69.109 69.110 static int 69.111 @@ -279,7 +290,7 @@ static struct x86_emulate_ops hvm_shadow 69.112 static int 69.113 pv_emulate_read(enum x86_segment seg, 69.114 unsigned long offset, 69.115 - unsigned long *val, 69.116 + void *p_data, 69.117 unsigned int bytes, 69.118 struct x86_emulate_ctxt *ctxt) 69.119 { 69.120 @@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg, 69.121 if ( !is_x86_user_segment(seg) ) 69.122 return X86EMUL_UNHANDLEABLE; 69.123 69.124 - *val = 0; 69.125 - if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 ) 69.126 + if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 ) 69.127 { 69.128 propagate_page_fault(offset + bytes - rc, 0); /* read fault */ 69.129 return X86EMUL_EXCEPTION; 69.130 @@ -301,7 +311,7 @@ pv_emulate_read(enum x86_segment seg, 69.131 static int 69.132 pv_emulate_write(enum x86_segment seg, 69.133 unsigned long offset, 69.134 - unsigned long val, 69.135 + void *p_data, 69.136 unsigned int bytes, 69.137 struct x86_emulate_ctxt *ctxt) 69.138 { 69.139 @@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg, 69.140 if ( !is_x86_user_segment(seg) ) 69.141 return X86EMUL_UNHANDLEABLE; 69.142 return v->arch.paging.mode->shadow.x86_emulate_write( 69.143 - v, offset, &val, bytes, sh_ctxt); 69.144 + v, offset, p_data, bytes, sh_ctxt); 69.145 } 69.146 69.147 static int 69.148 @@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh 69.149 } 69.150 } 69.151 } 69.152 + 69.153 + 69.154 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.155 +/**************************************************************************/ 69.156 +/* Out-of-sync shadows. */ 69.157 + 69.158 +/* From time to time, we let a shadowed pagetable page go out of sync 69.159 + * with its shadow: the guest is allowed to write directly to the page, 69.160 + * and those writes are not synchronously reflected in the shadow. 69.161 + * This lets us avoid many emulations if the guest is writing a lot to a 69.162 + * pagetable, but it relaxes a pretty important invariant in the shadow 69.163 + * pagetable design. Therefore, some rules: 69.164 + * 69.165 + * 1. Only L1 pagetables may go out of sync: any page that is shadowed 69.166 + * at at higher level must be synchronously updated. This makes 69.167 + * using linear shadow pagetables much less dangerous. 69.168 + * That means that: (a) unsyncing code needs to check for higher-level 69.169 + * shadows, and (b) promotion code needs to resync. 69.170 + * 69.171 + * 2. All shadow operations on a guest page require the page to be brought 69.172 + * back into sync before proceeding. This must be done under the 69.173 + * shadow lock so that the page is guaranteed to remain synced until 69.174 + * the operation completes. 69.175 + * 69.176 + * Exceptions to this rule: the pagefault and invlpg handlers may 69.177 + * update only one entry on an out-of-sync page without resyncing it. 
69.178 + * 69.179 + * 3. Operations on shadows that do not start from a guest page need to 69.180 + * be aware that they may be handling an out-of-sync shadow. 69.181 + * 69.182 + * 4. Operations that do not normally take the shadow lock (fast-path 69.183 + * #PF handler, INVLPG) must fall back to a locking, syncing version 69.184 + * if they see an out-of-sync table. 69.185 + * 69.186 + * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG) 69.187 + * must explicitly resync all relevant pages or update their 69.188 + * shadows. 69.189 + * 69.190 + * Currently out-of-sync pages are listed in a simple open-addressed 69.191 + * hash table with a second chance (must resist temptation to radically 69.192 + * over-engineer hash tables...) The virtual address of the access 69.193 + * which caused us to unsync the page is also kept in the hash table, as 69.194 + * a hint for finding the writable mappings later. 69.195 + * 69.196 + * We keep a hash per vcpu, because we want as much as possible to do 69.197 + * the re-sync on the save vcpu we did the unsync on, so the VA hint 69.198 + * will be valid. 69.199 + */ 69.200 + 69.201 + 69.202 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL 69.203 +static void sh_oos_audit(struct domain *d) 69.204 +{ 69.205 + int idx, expected_idx, expected_idx_alt; 69.206 + struct page_info *pg; 69.207 + struct vcpu *v; 69.208 + 69.209 + for_each_vcpu(d, v) 69.210 + { 69.211 + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 69.212 + { 69.213 + mfn_t *oos = v->arch.paging.shadow.oos; 69.214 + if ( !mfn_valid(oos[idx]) ) 69.215 + continue; 69.216 + 69.217 + expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES; 69.218 + expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES); 69.219 + if ( idx != expected_idx && idx != expected_idx_alt ) 69.220 + { 69.221 + printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n", 69.222 + __func__, idx, mfn_x(oos[idx]), 69.223 + expected_idx, expected_idx_alt); 69.224 + BUG(); 69.225 + } 69.226 + pg = mfn_to_page(oos[idx]); 69.227 + if ( !(pg->count_info & PGC_page_table) ) 69.228 + { 69.229 + printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n", 69.230 + __func__, idx, mfn_x(oos[idx]), pg->count_info); 69.231 + BUG(); 69.232 + } 69.233 + if ( !(pg->shadow_flags & SHF_out_of_sync) ) 69.234 + { 69.235 + printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n", 69.236 + __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); 69.237 + BUG(); 69.238 + } 69.239 + if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) ) 69.240 + { 69.241 + printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n", 69.242 + __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); 69.243 + BUG(); 69.244 + } 69.245 + } 69.246 + } 69.247 +} 69.248 +#endif 69.249 + 69.250 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES 69.251 +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 69.252 +{ 69.253 + int idx; 69.254 + struct vcpu *v; 69.255 + mfn_t *oos; 69.256 + 69.257 + ASSERT(mfn_is_out_of_sync(gmfn)); 69.258 + 69.259 + for_each_vcpu(d, v) 69.260 + { 69.261 + oos = v->arch.paging.shadow.oos; 69.262 + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; 69.263 + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) 69.264 + idx = (idx + 1) % SHADOW_OOS_PAGES; 69.265 + 69.266 + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) 69.267 + return; 69.268 + } 69.269 + 69.270 + SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn)); 69.271 + BUG(); 69.272 +} 69.273 +#endif 69.274 + 69.275 +/* Update the shadow, but keep the page out of sync. 
*/ 69.276 +static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn) 69.277 +{ 69.278 + struct page_info *pg = mfn_to_page(gmfn); 69.279 + 69.280 + ASSERT(mfn_valid(gmfn)); 69.281 + ASSERT(page_is_out_of_sync(pg)); 69.282 + 69.283 + /* Call out to the appropriate per-mode resyncing function */ 69.284 + if ( pg->shadow_flags & SHF_L1_32 ) 69.285 + SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn); 69.286 + else if ( pg->shadow_flags & SHF_L1_PAE ) 69.287 + SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn); 69.288 +#if CONFIG_PAGING_LEVELS >= 4 69.289 + else if ( pg->shadow_flags & SHF_L1_64 ) 69.290 + SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn); 69.291 +#endif 69.292 +} 69.293 + 69.294 +#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i)) 69.295 + 69.296 +void oos_fixup_add(struct vcpu *v, mfn_t gmfn, 69.297 + mfn_t smfn, unsigned long off) 69.298 +{ 69.299 + int idx, i, free = 0, free_slot = 0; 69.300 + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; 69.301 + 69.302 + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; 69.303 + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) 69.304 + { 69.305 + if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn) 69.306 + || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) ) 69.307 + { 69.308 + free = 1; 69.309 + free_slot = _FIXUP_IDX(idx, i); 69.310 + } 69.311 + else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn)) 69.312 + && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn)) 69.313 + && (fixups[_FIXUP_IDX(idx, i)].off == off) ) 69.314 + { 69.315 + perfc_incr(shadow_oos_fixup_no_add); 69.316 + return; 69.317 + } 69.318 + } 69.319 + 69.320 + if ( free ) 69.321 + { 69.322 + if ( !v->arch.paging.shadow.oos_fixup_used ) 69.323 + v->arch.paging.shadow.oos_fixup_used = 1; 69.324 + fixups[free_slot].gmfn = gmfn; 69.325 + fixups[free_slot].smfn = smfn; 69.326 + fixups[free_slot].off = off; 69.327 + perfc_incr(shadow_oos_fixup_add_ok); 69.328 + return; 69.329 + } 69.330 + 69.331 + 69.332 + perfc_incr(shadow_oos_fixup_add_fail); 69.333 +} 69.334 + 69.335 +void oos_fixup_remove(struct vcpu *v, mfn_t gmfn) 69.336 +{ 69.337 + int idx, i; 69.338 + struct domain *d = v->domain; 69.339 + 69.340 + perfc_incr(shadow_oos_fixup_remove); 69.341 + 69.342 + /* If the domain is dying we might get called when deallocating 69.343 + * the shadows. Fixup tables are already freed so exit now. 
*/ 69.344 + if ( d->is_dying ) 69.345 + return; 69.346 + 69.347 + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; 69.348 + for_each_vcpu(d, v) 69.349 + { 69.350 + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; 69.351 + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) 69.352 + if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) ) 69.353 + fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN); 69.354 + } 69.355 +} 69.356 + 69.357 +int oos_fixup_flush(struct vcpu *v) 69.358 +{ 69.359 + int i, rc = 0; 69.360 + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; 69.361 + 69.362 + perfc_incr(shadow_oos_fixup_flush); 69.363 + 69.364 + if ( !v->arch.paging.shadow.oos_fixup_used ) 69.365 + return 0; 69.366 + 69.367 + for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ ) 69.368 + { 69.369 + if ( mfn_valid(fixups[i].gmfn) ) 69.370 + { 69.371 + if ( mfn_is_out_of_sync(fixups[i].gmfn) ) 69.372 + rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn, 69.373 + fixups[i].smfn, 69.374 + fixups[i].off); 69.375 + fixups[i].gmfn = _mfn(INVALID_MFN); 69.376 + } 69.377 + } 69.378 + 69.379 + v->arch.paging.shadow.oos_fixup_used = 0; 69.380 + 69.381 + return rc; 69.382 +} 69.383 + 69.384 +int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn) 69.385 +{ 69.386 + int idx, i, rc = 0; 69.387 + struct domain *d = v->domain; 69.388 + 69.389 + perfc_incr(shadow_oos_fixup_flush_gmfn); 69.390 + 69.391 + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; 69.392 + for_each_vcpu(d, v) 69.393 + { 69.394 + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; 69.395 + 69.396 + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) 69.397 + { 69.398 + if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) ) 69.399 + continue; 69.400 + 69.401 + rc |= sh_remove_write_access_from_sl1p(v, 69.402 + fixups[_FIXUP_IDX(idx,i)].gmfn, 69.403 + fixups[_FIXUP_IDX(idx,i)].smfn, 69.404 + fixups[_FIXUP_IDX(idx,i)].off); 69.405 + 69.406 + fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN); 69.407 + } 69.408 + } 69.409 + 69.410 + return rc; 69.411 +} 69.412 + 69.413 +static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va) 69.414 +{ 69.415 + int ftlb = 0; 69.416 + 69.417 + ftlb |= oos_fixup_flush_gmfn(v, gmfn); 69.418 + 69.419 + switch ( sh_remove_write_access(v, gmfn, 0, va) ) 69.420 + { 69.421 + default: 69.422 + case 0: 69.423 + break; 69.424 + 69.425 + case 1: 69.426 + ftlb |= 1; 69.427 + break; 69.428 + 69.429 + case -1: 69.430 + /* An unfindable writeable typecount has appeared, probably via a 69.431 + * grant table entry: can't shoot the mapping, so try to unshadow 69.432 + * the page. If that doesn't work either, the guest is granting 69.433 + * his pagetables and must be killed after all. 69.434 + * This will flush the tlb, so we can return with no worries. */ 69.435 + sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); 69.436 + return 1; 69.437 + } 69.438 + 69.439 + if ( ftlb ) 69.440 + flush_tlb_mask(v->domain->domain_dirty_cpumask); 69.441 + 69.442 + return 0; 69.443 +} 69.444 + 69.445 + 69.446 +/* Pull all the entries on an out-of-sync page back into sync. */ 69.447 +static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp) 69.448 +{ 69.449 + struct page_info *pg = mfn_to_page(gmfn); 69.450 + 69.451 + ASSERT(shadow_locked_by_me(v->domain)); 69.452 + ASSERT(mfn_is_out_of_sync(gmfn)); 69.453 + /* Guest page must be shadowed *only* as L1 when out of sync. 
*/ 69.454 + ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 69.455 + & ~SHF_L1_ANY)); 69.456 + ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn))); 69.457 + 69.458 + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n", 69.459 + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); 69.460 + 69.461 + /* Need to pull write access so the page *stays* in sync. */ 69.462 + if ( oos_remove_write_access(v, gmfn, va) ) 69.463 + { 69.464 + /* Page has been unshadowed. */ 69.465 + return; 69.466 + } 69.467 + 69.468 + /* No more writable mappings of this page, please */ 69.469 + pg->shadow_flags &= ~SHF_oos_may_write; 69.470 + 69.471 + /* Update the shadows with current guest entries. */ 69.472 + _sh_resync_l1(v, gmfn, snp); 69.473 + 69.474 + /* Now we know all the entries are synced, and will stay that way */ 69.475 + pg->shadow_flags &= ~SHF_out_of_sync; 69.476 + perfc_incr(shadow_resync); 69.477 +} 69.478 + 69.479 + 69.480 +/* Add an MFN to the list of out-of-sync guest pagetables */ 69.481 +static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va) 69.482 +{ 69.483 + int idx, oidx, swap = 0; 69.484 + void *gptr, *gsnpptr; 69.485 + mfn_t *oos = v->arch.paging.shadow.oos; 69.486 + unsigned long *oos_va = v->arch.paging.shadow.oos_va; 69.487 + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; 69.488 + 69.489 + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; 69.490 + oidx = idx; 69.491 + 69.492 + if ( mfn_valid(oos[idx]) 69.493 + && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx ) 69.494 + { 69.495 + /* Punt the current occupant into the next slot */ 69.496 + SWAP(oos[idx], gmfn); 69.497 + SWAP(oos_va[idx], va); 69.498 + swap = 1; 69.499 + idx = (idx + 1) % SHADOW_OOS_PAGES; 69.500 + } 69.501 + if ( mfn_valid(oos[idx]) ) 69.502 + { 69.503 + /* Crush the current occupant. 
*/ 69.504 + _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]); 69.505 + perfc_incr(shadow_unsync_evict); 69.506 + } 69.507 + oos[idx] = gmfn; 69.508 + oos_va[idx] = va; 69.509 + 69.510 + if ( swap ) 69.511 + SWAP(oos_snapshot[idx], oos_snapshot[oidx]); 69.512 + 69.513 + gptr = sh_map_domain_page(oos[oidx]); 69.514 + gsnpptr = sh_map_domain_page(oos_snapshot[oidx]); 69.515 + memcpy(gsnpptr, gptr, PAGE_SIZE); 69.516 + sh_unmap_domain_page(gptr); 69.517 + sh_unmap_domain_page(gsnpptr); 69.518 +} 69.519 + 69.520 +/* Remove an MFN from the list of out-of-sync guest pagetables */ 69.521 +static void oos_hash_remove(struct vcpu *v, mfn_t gmfn) 69.522 +{ 69.523 + int idx; 69.524 + mfn_t *oos; 69.525 + struct domain *d = v->domain; 69.526 + 69.527 + SHADOW_PRINTK("D%dV%d gmfn %lx\n", 69.528 + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 69.529 + 69.530 + for_each_vcpu(d, v) 69.531 + { 69.532 + oos = v->arch.paging.shadow.oos; 69.533 + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; 69.534 + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) 69.535 + idx = (idx + 1) % SHADOW_OOS_PAGES; 69.536 + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) 69.537 + { 69.538 + oos[idx] = _mfn(INVALID_MFN); 69.539 + return; 69.540 + } 69.541 + } 69.542 + 69.543 + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); 69.544 + BUG(); 69.545 +} 69.546 + 69.547 +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn) 69.548 +{ 69.549 + int idx; 69.550 + mfn_t *oos; 69.551 + mfn_t *oos_snapshot; 69.552 + struct domain *d = v->domain; 69.553 + 69.554 + for_each_vcpu(d, v) 69.555 + { 69.556 + oos = v->arch.paging.shadow.oos; 69.557 + oos_snapshot = v->arch.paging.shadow.oos_snapshot; 69.558 + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; 69.559 + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) 69.560 + idx = (idx + 1) % SHADOW_OOS_PAGES; 69.561 + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) 69.562 + { 69.563 + return oos_snapshot[idx]; 69.564 + } 69.565 + } 69.566 + 69.567 + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); 69.568 + BUG(); 69.569 + return _mfn(INVALID_MFN); 69.570 +} 69.571 + 69.572 +/* Pull a single guest page back into sync */ 69.573 +void sh_resync(struct vcpu *v, mfn_t gmfn) 69.574 +{ 69.575 + int idx; 69.576 + mfn_t *oos; 69.577 + unsigned long *oos_va; 69.578 + mfn_t *oos_snapshot; 69.579 + struct domain *d = v->domain; 69.580 + 69.581 + for_each_vcpu(d, v) 69.582 + { 69.583 + oos = v->arch.paging.shadow.oos; 69.584 + oos_va = v->arch.paging.shadow.oos_va; 69.585 + oos_snapshot = v->arch.paging.shadow.oos_snapshot; 69.586 + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; 69.587 + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) 69.588 + idx = (idx + 1) % SHADOW_OOS_PAGES; 69.589 + 69.590 + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) 69.591 + { 69.592 + _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]); 69.593 + oos[idx] = _mfn(INVALID_MFN); 69.594 + return; 69.595 + } 69.596 + } 69.597 + 69.598 + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); 69.599 + BUG(); 69.600 +} 69.601 + 69.602 +/* Figure out whether it's definitely safe not to sync this l1 table, 69.603 + * by making a call out to the mode in which that shadow was made. 
*/ 69.604 +static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn) 69.605 +{ 69.606 + struct page_info *pg = mfn_to_page(gl1mfn); 69.607 + if ( pg->shadow_flags & SHF_L1_32 ) 69.608 + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn); 69.609 + else if ( pg->shadow_flags & SHF_L1_PAE ) 69.610 + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn); 69.611 +#if CONFIG_PAGING_LEVELS >= 4 69.612 + else if ( pg->shadow_flags & SHF_L1_64 ) 69.613 + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn); 69.614 +#endif 69.615 + SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 69.616 + mfn_x(gl1mfn)); 69.617 + BUG(); 69.618 + return 0; /* BUG() is no longer __attribute__((noreturn)). */ 69.619 +} 69.620 + 69.621 + 69.622 +/* Pull all out-of-sync pages back into sync. Pages brought out of sync 69.623 + * on other vcpus are allowed to remain out of sync, but their contents 69.624 + * will be made safe (TLB flush semantics); pages unsynced by this vcpu 69.625 + * are brought back into sync and write-protected. If skip != 0, we try 69.626 + * to avoid resyncing at all if we think we can get away with it. */ 69.627 +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking) 69.628 +{ 69.629 + int idx; 69.630 + struct vcpu *other; 69.631 + mfn_t *oos = v->arch.paging.shadow.oos; 69.632 + unsigned long *oos_va = v->arch.paging.shadow.oos_va; 69.633 + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; 69.634 + 69.635 + SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id); 69.636 + 69.637 + ASSERT(do_locking || shadow_locked_by_me(v->domain)); 69.638 + 69.639 + if ( !this ) 69.640 + goto resync_others; 69.641 + 69.642 + if ( do_locking ) 69.643 + shadow_lock(v->domain); 69.644 + 69.645 + if ( oos_fixup_flush(v) ) 69.646 + flush_tlb_mask(v->domain->domain_dirty_cpumask); 69.647 + 69.648 + /* First: resync all of this vcpu's oos pages */ 69.649 + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 69.650 + if ( mfn_valid(oos[idx]) ) 69.651 + { 69.652 + /* Write-protect and sync contents */ 69.653 + _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]); 69.654 + oos[idx] = _mfn(INVALID_MFN); 69.655 + } 69.656 + 69.657 + if ( do_locking ) 69.658 + shadow_unlock(v->domain); 69.659 + 69.660 + resync_others: 69.661 + if ( !others ) 69.662 + return; 69.663 + 69.664 + /* Second: make all *other* vcpus' oos pages safe. */ 69.665 + for_each_vcpu(v->domain, other) 69.666 + { 69.667 + if ( v == other ) 69.668 + continue; 69.669 + 69.670 + if ( do_locking ) 69.671 + shadow_lock(v->domain); 69.672 + 69.673 + oos = other->arch.paging.shadow.oos; 69.674 + oos_va = other->arch.paging.shadow.oos_va; 69.675 + oos_snapshot = other->arch.paging.shadow.oos_snapshot; 69.676 + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 69.677 + { 69.678 + if ( !mfn_valid(oos[idx]) ) 69.679 + continue; 69.680 + 69.681 + if ( skip ) 69.682 + { 69.683 + /* Update the shadows and leave the page OOS. 
*/ 69.684 + if ( sh_skip_sync(v, oos[idx]) ) 69.685 + continue; 69.686 + _sh_resync_l1(other, oos[idx], oos_snapshot[idx]); 69.687 + } 69.688 + else 69.689 + { 69.690 + /* Write-protect and sync contents */ 69.691 + _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]); 69.692 + oos[idx] = _mfn(INVALID_MFN); 69.693 + } 69.694 + } 69.695 + 69.696 + if ( do_locking ) 69.697 + shadow_unlock(v->domain); 69.698 + } 69.699 +} 69.700 + 69.701 +/* Allow a shadowed page to go out of sync */ 69.702 +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va) 69.703 +{ 69.704 + struct page_info *pg; 69.705 + 69.706 + ASSERT(shadow_locked_by_me(v->domain)); 69.707 + 69.708 + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n", 69.709 + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); 69.710 + 69.711 + pg = mfn_to_page(gmfn); 69.712 + 69.713 + /* Guest page must be shadowed *only* as L1 and *only* once when out 69.714 + * of sync. Also, get out now if it's already out of sync. 69.715 + * Also, can't safely unsync if some vcpus have paging disabled.*/ 69.716 + if ( pg->shadow_flags & 69.717 + ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 69.718 + || sh_page_has_multiple_shadows(pg) 69.719 + || !is_hvm_domain(v->domain) 69.720 + || !v->domain->arch.paging.shadow.oos_active ) 69.721 + return 0; 69.722 + 69.723 + pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write; 69.724 + oos_hash_add(v, gmfn, va); 69.725 + perfc_incr(shadow_unsync); 69.726 + return 1; 69.727 +} 69.728 + 69.729 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ 69.730 + 69.731 69.732 /**************************************************************************/ 69.733 /* Code for "promoting" a guest page to the point where the shadow code is 69.734 @@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_ 69.735 69.736 ASSERT(mfn_valid(gmfn)); 69.737 69.738 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.739 + /* Is the page already shadowed and out of sync? */ 69.740 + if ( page_is_out_of_sync(page) ) 69.741 + sh_resync(v, gmfn); 69.742 +#endif 69.743 + 69.744 /* We should never try to promote a gmfn that has writeable mappings */ 69.745 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page 69.746 || (page->u.inuse.type_info & PGT_count_mask) == 0 69.747 @@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t 69.748 clear_bit(type, &page->shadow_flags); 69.749 69.750 if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) 69.751 + { 69.752 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.753 + /* Was the page out of sync? 
*/ 69.754 + if ( page_is_out_of_sync(page) ) 69.755 + { 69.756 + oos_hash_remove(v, gmfn); 69.757 + oos_fixup_remove(v, gmfn); 69.758 + } 69.759 +#endif 69.760 clear_bit(_PGC_page_table, &page->count_info); 69.761 + } 69.762 } 69.763 69.764 /**************************************************************************/ 69.765 @@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type) 69.766 0, /* SH_type_l3_64_shadow */ 69.767 0, /* SH_type_l4_64_shadow */ 69.768 2, /* SH_type_p2m_table */ 69.769 - 0 /* SH_type_monitor_table */ 69.770 + 0, /* SH_type_monitor_table */ 69.771 + 0 /* SH_type_oos_snapshot */ 69.772 }; 69.773 ASSERT(shadow_type < SH_type_unused); 69.774 return type_to_order[shadow_type]; 69.775 @@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st 69.776 sp = list_entry(d->arch.paging.shadow.freelists[order].next, 69.777 struct shadow_page_info, list); 69.778 list_del(&sp->list); 69.779 +#if defined(__x86_64__) 69.780 + /* 69.781 + * Re-instate lock field which we overwrite with shadow_page_info. 69.782 + * This was safe, since the lock is only used on guest pages. 69.783 + */ 69.784 + for ( j = 0; j < 1U << order; j++ ) 69.785 + spin_lock_init(&((struct page_info *)sp)[j].lock); 69.786 +#endif 69.787 d->arch.paging.shadow.free_pages -= 1 << order; 69.788 d->arch.paging.shadow.total_pages -= 1 << order; 69.789 free_domheap_pages((struct page_info *)sp, order); 69.790 @@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct 69.791 /* Bad shadow flags on guest page? */ 69.792 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) ); 69.793 /* Bad type count on guest page? */ 69.794 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.795 + if ( sp->type == SH_type_l1_32_shadow 69.796 + || sp->type == SH_type_l1_pae_shadow 69.797 + || sp->type == SH_type_l1_64_shadow ) 69.798 + { 69.799 + if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 69.800 + && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) 69.801 + { 69.802 + if ( !page_is_out_of_sync(gpg) ) 69.803 + { 69.804 + SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")" 69.805 + " and not OOS but has typecount %#lx\n", 69.806 + sp->backpointer, 69.807 + mfn_x(shadow_page_to_mfn(sp)), 69.808 + gpg->u.inuse.type_info); 69.809 + BUG(); 69.810 + } 69.811 + } 69.812 + } 69.813 + else /* Not an l1 */ 69.814 +#endif 69.815 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 69.816 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) 69.817 { 69.818 @@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m 69.819 /* Remove all writeable mappings of a guest frame from the shadow tables 69.820 * Returns non-zero if we need to flush TLBs. 69.821 * level and fault_addr desribe how we found this to be a pagetable; 69.822 - * level==0 means we have some other reason for revoking write access.*/ 69.823 + * level==0 means we have some other reason for revoking write access. 69.824 + * If level==0 we are allowed to fail, returning -1. 
*/ 69.825 69.826 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 69.827 unsigned int level, 69.828 @@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu * 69.829 return 0; 69.830 69.831 /* Early exit if it's already a pagetable, or otherwise not writeable */ 69.832 - if ( sh_mfn_is_a_page_table(gmfn) 69.833 + if ( (sh_mfn_is_a_page_table(gmfn) 69.834 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.835 + /* Unless they've been allowed to go out of sync with their shadows */ 69.836 + && !mfn_oos_may_write(gmfn) 69.837 +#endif 69.838 + ) 69.839 || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) 69.840 return 0; 69.841 69.842 @@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu * 69.843 } 69.844 69.845 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC 69.846 - if ( v == current && level != 0 ) 69.847 + if ( v == current ) 69.848 { 69.849 unsigned long gfn; 69.850 /* Heuristic: there is likely to be only one writeable mapping, 69.851 @@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu * 69.852 return 1; \ 69.853 } while (0) 69.854 69.855 + if ( level == 0 && fault_addr ) 69.856 + GUESS(fault_addr, 6); 69.857 69.858 if ( v->arch.paging.mode->guest_levels == 2 ) 69.859 { 69.860 @@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu * 69.861 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */ 69.862 69.863 /* Brute-force search of all the shadows, by walking the hash */ 69.864 - perfc_incr(shadow_writeable_bf); 69.865 + if ( level == 0 ) 69.866 + perfc_incr(shadow_writeable_bf_1); 69.867 + else 69.868 + perfc_incr(shadow_writeable_bf); 69.869 hash_foreach(v, callback_mask, callbacks, gmfn); 69.870 69.871 /* If that didn't catch the mapping, then there's some non-pagetable 69.872 * mapping -- ioreq page, grant mapping, &c. */ 69.873 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) 69.874 { 69.875 + if ( level == 0 ) 69.876 + return -1; 69.877 + 69.878 SHADOW_ERROR("can't remove write access to mfn %lx: guest has " 69.879 "%lu special-use mappings of it\n", mfn_x(gmfn), 69.880 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); 69.881 @@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu * 69.882 return 1; 69.883 } 69.884 69.885 - 69.886 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.887 +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 69.888 + mfn_t smfn, unsigned long off) 69.889 +{ 69.890 + struct shadow_page_info *sp = mfn_to_shadow_page(smfn); 69.891 + 69.892 + ASSERT(mfn_valid(smfn)); 69.893 + ASSERT(mfn_valid(gmfn)); 69.894 + 69.895 + if ( sp->type == SH_type_l1_32_shadow ) 69.896 + { 69.897 + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2) 69.898 + (v, gmfn, smfn, off); 69.899 + } 69.900 +#if CONFIG_PAGING_LEVELS >= 3 69.901 + else if ( sp->type == SH_type_l1_pae_shadow ) 69.902 + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3) 69.903 + (v, gmfn, smfn, off); 69.904 +#if CONFIG_PAGING_LEVELS >= 4 69.905 + else if ( sp->type == SH_type_l1_64_shadow ) 69.906 + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4) 69.907 + (v, gmfn, smfn, off); 69.908 +#endif 69.909 +#endif 69.910 + 69.911 + return 0; 69.912 +} 69.913 +#endif 69.914 69.915 /**************************************************************************/ 69.916 /* Remove all mappings of a guest frame from the shadow tables. 
69.917 @@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc 69.918 } 69.919 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ 69.920 69.921 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.922 + if ( v->arch.paging.shadow.oos_fixups == NULL ) 69.923 + { 69.924 + int i; 69.925 + v->arch.paging.shadow.oos_fixups = 69.926 + alloc_xenheap_pages(SHADOW_OOS_FT_ORDER); 69.927 + if ( v->arch.paging.shadow.oos_fixups == NULL ) 69.928 + { 69.929 + SHADOW_ERROR("Could not allocate OOS fixup table" 69.930 + " for dom %u vcpu %u\n", 69.931 + v->domain->domain_id, v->vcpu_id); 69.932 + domain_crash(v->domain); 69.933 + return; 69.934 + } 69.935 + for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ ) 69.936 + v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN); 69.937 + } 69.938 + 69.939 + if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN ) 69.940 + { 69.941 + int i; 69.942 + for(i = 0; i < SHADOW_OOS_PAGES; i++) 69.943 + { 69.944 + shadow_prealloc(d, SH_type_oos_snapshot, 1); 69.945 + v->arch.paging.shadow.oos_snapshot[i] = 69.946 + shadow_alloc(d, SH_type_oos_snapshot, 0); 69.947 + } 69.948 + } 69.949 +#endif /* OOS */ 69.950 + 69.951 // Valid transitions handled by this function: 69.952 // - For PV guests: 69.953 // - after a shadow mode has been changed 69.954 @@ -2159,6 +2865,13 @@ static void sh_update_paging_modes(struc 69.955 ASSERT(shadow_mode_translate(d)); 69.956 ASSERT(shadow_mode_external(d)); 69.957 69.958 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.959 + /* Need to resync all our pages now, because if a page goes out 69.960 + * of sync with paging enabled and is resynced with paging 69.961 + * disabled, the resync will go wrong. */ 69.962 + shadow_resync_all(v, 0); 69.963 +#endif /* OOS */ 69.964 + 69.965 if ( !hvm_paging_enabled(v) ) 69.966 { 69.967 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE 69.968 @@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc 69.969 // This *does* happen, at least for CR4.PGE... 69.970 } 69.971 69.972 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.973 + /* We need to check that all the vcpus have paging enabled to 69.974 + * unsync PTs. 
*/ 69.975 + if ( is_hvm_domain(d) ) 69.976 + { 69.977 + int pe = 1; 69.978 + struct vcpu *vptr; 69.979 + 69.980 + for_each_vcpu(d, vptr) 69.981 + { 69.982 + if ( !hvm_paging_enabled(vptr) ) 69.983 + { 69.984 + pe = 0; 69.985 + break; 69.986 + } 69.987 + } 69.988 + 69.989 + d->arch.paging.shadow.oos_active = pe; 69.990 + } 69.991 +#endif /* OOS */ 69.992 + 69.993 v->arch.paging.mode->update_cr3(v, 0); 69.994 } 69.995 69.996 @@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d) 69.997 } 69.998 } 69.999 69.1000 -#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) 69.1001 +#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) 69.1002 /* Free the virtual-TLB array attached to each vcpu */ 69.1003 for_each_vcpu(d, v) 69.1004 { 69.1005 +#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) 69.1006 if ( v->arch.paging.vtlb ) 69.1007 { 69.1008 xfree(v->arch.paging.vtlb); 69.1009 v->arch.paging.vtlb = NULL; 69.1010 } 69.1011 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ 69.1012 + 69.1013 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.1014 + if ( v->arch.paging.shadow.oos_fixups ) 69.1015 + { 69.1016 + free_xenheap_pages(v->arch.paging.shadow.oos_fixups, 69.1017 + SHADOW_OOS_FT_ORDER); 69.1018 + v->arch.paging.shadow.oos_fixups = NULL; 69.1019 + } 69.1020 + 69.1021 + { 69.1022 + int i; 69.1023 + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; 69.1024 + for(i = 0; i < SHADOW_OOS_PAGES; i++) 69.1025 + if ( mfn_valid(oos_snapshot[i]) ) 69.1026 + shadow_free(d, oos_snapshot[i]); 69.1027 + } 69.1028 +#endif /* OOS */ 69.1029 } 69.1030 -#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ 69.1031 +#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */ 69.1032 69.1033 list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist) 69.1034 { 69.1035 @@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v) 69.1036 69.1037 if ( !(SHADOW_AUDIT_ENABLE) ) 69.1038 return; 69.1039 - 69.1040 + 69.1041 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 69.1042 + sh_oos_audit(v->domain); 69.1043 +#endif 69.1044 + 69.1045 if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) 69.1046 mask = ~1; /* Audit every table in the system */ 69.1047 else
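The out-of-sync tracking added above keeps, per vcpu, a small open-addressed hash of unsynced guest page tables: a gmfn hashes to mfn_x(gmfn) % SHADOW_OOS_PAGES and, failing that, to the following slot (the "second chance"), which is exactly how oos_hash_remove(), sh_resync() and oos_snapshot_lookup() probe it. A minimal sketch of that two-slot probe; the helper name is illustrative:

    /* Return the index in 'oos' (one vcpu's array of SHADOW_OOS_PAGES
     * entries) that holds 'gmfn', or -1 if the page is not hashed here. */
    static int oos_probe(const mfn_t *oos, mfn_t gmfn)
    {
        int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
            idx = (idx + 1) % SHADOW_OOS_PAGES;      /* second-chance slot */

        return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
    }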
70.1 --- a/xen/arch/x86/mm/shadow/multi.c Thu Jun 19 12:48:04 2008 +0900 70.2 +++ b/xen/arch/x86/mm/shadow/multi.c Wed Jul 02 11:30:37 2008 +0900 70.3 @@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig 70.4 } 70.5 70.6 /* Remove write access permissions from a gwalk_t in a batch, and 70.7 - * return OR-ed result for TLB flush hint 70.8 + * return OR-ed result for TLB flush hint and need to rewalk the guest 70.9 + * pages. 70.10 + * 70.11 + * Syncing pages will remove write access to that page; but it may 70.12 + * also give write access to other pages in the path. If we resync any 70.13 + * pages, re-walk from the beginning. 70.14 */ 70.15 +#define GW_RMWR_FLUSHTLB 1 70.16 +#define GW_RMWR_REWALK 2 70.17 + 70.18 static inline uint32_t 70.19 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw) 70.20 { 70.21 - int rc = 0; 70.22 + uint32_t rc = 0; 70.23 70.24 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ 70.25 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ 70.26 - rc = sh_remove_write_access(v, gw->l3mfn, 3, va); 70.27 -#endif 70.28 - rc |= sh_remove_write_access(v, gw->l2mfn, 2, va); 70.29 -#endif 70.30 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.31 + if ( mfn_is_out_of_sync(gw->l3mfn) ) 70.32 + { 70.33 + sh_resync(v, gw->l3mfn); 70.34 + rc = GW_RMWR_REWALK; 70.35 + } 70.36 + else 70.37 +#endif /* OOS */ 70.38 + if ( sh_remove_write_access(v, gw->l3mfn, 3, va) ) 70.39 + rc = GW_RMWR_FLUSHTLB; 70.40 +#endif /* GUEST_PAGING_LEVELS >= 4 */ 70.41 + 70.42 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.43 + if ( mfn_is_out_of_sync(gw->l2mfn) ) 70.44 + { 70.45 + sh_resync(v, gw->l2mfn); 70.46 + rc |= GW_RMWR_REWALK; 70.47 + } 70.48 + else 70.49 +#endif /* OOS */ 70.50 + if ( sh_remove_write_access(v, gw->l2mfn, 2, va) ) 70.51 + rc |= GW_RMWR_FLUSHTLB; 70.52 +#endif /* GUEST_PAGING_LEVELS >= 3 */ 70.53 + 70.54 if ( !(guest_supports_superpages(v) && 70.55 - (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) 70.56 - rc |= sh_remove_write_access(v, gw->l1mfn, 1, va); 70.57 + (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) 70.58 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.59 + && !mfn_is_out_of_sync(gw->l1mfn) 70.60 +#endif /* OOS */ 70.61 + && sh_remove_write_access(v, gw->l1mfn, 1, va) ) 70.62 + rc |= GW_RMWR_FLUSHTLB; 70.63 70.64 return rc; 70.65 } 70.66 @@ -882,7 +914,12 @@ static always_inline void 70.67 70.68 // protect guest page tables 70.69 // 70.70 - if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) ) 70.71 + if ( unlikely((level == 1) 70.72 + && sh_mfn_is_a_page_table(target_mfn) 70.73 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) 70.74 + && !mfn_oos_may_write(target_mfn) 70.75 +#endif /* OOS */ 70.76 + ) ) 70.77 { 70.78 if ( shadow_mode_trap_reads(d) ) 70.79 { 70.80 @@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v 70.81 domain_crash(v->domain); 70.82 return SHADOW_SET_ERROR; 70.83 } 70.84 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) 70.85 + shadow_resync_all(v, 0); 70.86 +#endif 70.87 } 70.88 70.89 /* Write the new entry */ 70.90 @@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v 70.91 | (((unsigned long)sl3e) & ~PAGE_MASK)); 70.92 70.93 if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) 70.94 + { 70.95 /* About to install a new reference */ 70.96 if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) ) 70.97 { 70.98 domain_crash(v->domain); 70.99 return SHADOW_SET_ERROR; 70.100 - } 70.101 + } 70.102 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) 70.103 + shadow_resync_all(v, 0); 70.104 
+#endif 70.105 + } 70.106 70.107 /* Write the new entry */ 70.108 shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); 70.109 @@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v 70.110 | (((unsigned long)sl2e) & ~PAGE_MASK)); 70.111 70.112 if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 70.113 + { 70.114 + mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e); 70.115 + 70.116 /* About to install a new reference */ 70.117 - if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) ) 70.118 + if ( !sh_get_ref(v, sl1mfn, paddr) ) 70.119 { 70.120 domain_crash(v->domain); 70.121 return SHADOW_SET_ERROR; 70.122 - } 70.123 + } 70.124 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.125 + { 70.126 + struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn); 70.127 + mfn_t gl1mfn = _mfn(sp->backpointer); 70.128 + 70.129 + /* If the shadow is a fl1 then the backpointer contains 70.130 + the GFN instead of the GMFN, and it's definitely not 70.131 + OOS. */ 70.132 + if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn) 70.133 + && mfn_is_out_of_sync(gl1mfn) ) 70.134 + sh_resync(v, gl1mfn); 70.135 + } 70.136 +#endif 70.137 + } 70.138 70.139 /* Write the new entry */ 70.140 #if GUEST_PAGING_LEVELS == 2 70.141 @@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v 70.142 int flags = 0; 70.143 struct domain *d = v->domain; 70.144 shadow_l1e_t old_sl1e; 70.145 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC 70.146 + mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e); 70.147 +#endif 70.148 ASSERT(sl1e != NULL); 70.149 70.150 old_sl1e = *sl1e; 70.151 @@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v 70.152 /* Doesn't look like a pagetable. */ 70.153 flags |= SHADOW_SET_ERROR; 70.154 new_sl1e = shadow_l1e_empty(); 70.155 - } else { 70.156 + } 70.157 + else 70.158 + { 70.159 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d); 70.160 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC 70.161 + if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn) 70.162 + && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) ) 70.163 + { 70.164 + oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e)); 70.165 + } 70.166 +#endif 70.167 + 70.168 } 70.169 } 70.170 } 70.171 @@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v, 70.172 mfn_t gmfn; 70.173 p2m_type_t p2mt; 70.174 int result = 0; 70.175 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.176 + mfn_t gl1mfn; 70.177 +#endif /* OOS */ 70.178 70.179 perfc_incr(shadow_validate_gl1e_calls); 70.180 70.181 @@ -2539,11 +2617,139 @@ static int validate_gl1e(struct vcpu *v, 70.182 gmfn = gfn_to_mfn(v->domain, gfn, &p2mt); 70.183 70.184 l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt); 70.185 - 70.186 result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); 70.187 + 70.188 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.189 + gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); 70.190 + if ( mfn_valid(gl1mfn) 70.191 + && mfn_is_out_of_sync(gl1mfn) ) 70.192 + { 70.193 + /* Update the OOS snapshot. 
*/ 70.194 + mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn); 70.195 + guest_l1e_t *snp; 70.196 + 70.197 + ASSERT(mfn_valid(snpmfn)); 70.198 + 70.199 + snp = sh_map_domain_page(snpmfn); 70.200 + snp[guest_index(new_ge)] = new_gl1e; 70.201 + sh_unmap_domain_page(snp); 70.202 + } 70.203 +#endif /* OOS */ 70.204 + 70.205 return result; 70.206 } 70.207 70.208 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.209 +/**************************************************************************/ 70.210 +/* Special validation function for re-syncing out-of-sync shadows. 70.211 + * Walks the *shadow* page, and for every entry that it finds, 70.212 + * revalidates the guest entry that corresponds to it. 70.213 + * N.B. This function is called with the vcpu that unsynced the page, 70.214 + * *not* the one that is causing it to be resynced. */ 70.215 +void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn) 70.216 +{ 70.217 + mfn_t sl1mfn; 70.218 + shadow_l1e_t *sl1p; 70.219 + guest_l1e_t *gl1p, *gp, *snp; 70.220 + int rc = 0; 70.221 + 70.222 + ASSERT(mfn_valid(snpmfn)); 70.223 + 70.224 + sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); 70.225 + ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */ 70.226 + 70.227 + snp = sh_map_domain_page(snpmfn); 70.228 + gp = sh_map_domain_page(gl1mfn); 70.229 + gl1p = gp; 70.230 + 70.231 + SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, { 70.232 + guest_l1e_t gl1e = *gl1p; 70.233 + guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p); 70.234 + 70.235 + if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) ) 70.236 + { 70.237 + gfn_t gfn; 70.238 + mfn_t gmfn; 70.239 + p2m_type_t p2mt; 70.240 + shadow_l1e_t nsl1e; 70.241 + 70.242 + gfn = guest_l1e_get_gfn(gl1e); 70.243 + gmfn = gfn_to_mfn(v->domain, gfn, &p2mt); 70.244 + l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt); 70.245 + rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn); 70.246 + 70.247 + *snpl1p = gl1e; 70.248 + } 70.249 + }); 70.250 + 70.251 + sh_unmap_domain_page(gp); 70.252 + sh_unmap_domain_page(snp); 70.253 + 70.254 + /* Setting shadow L1 entries should never need us to flush the TLB */ 70.255 + ASSERT(!(rc & SHADOW_SET_FLUSH)); 70.256 +} 70.257 + 70.258 +/* Figure out whether it's definitely safe not to sync this l1 table. 70.259 + * That is: if we can tell that it's only used once, and that the 70.260 + * toplevel shadow responsible is not one of ours. 70.261 + * N.B. This function is called with the vcpu that required the resync, 70.262 + * *not* the one that originally unsynced the page, but it is 70.263 + * called in the *mode* of the vcpu that unsynced it. Clear? Good. 
*/ 70.264 +int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn) 70.265 +{ 70.266 + struct shadow_page_info *sp; 70.267 + mfn_t smfn; 70.268 + 70.269 + smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); 70.270 + ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ 70.271 + 70.272 + /* Up to l2 */ 70.273 + sp = mfn_to_shadow_page(smfn); 70.274 + if ( sp->count != 1 || !sp->up ) 70.275 + return 0; 70.276 + smfn = _mfn(sp->up >> PAGE_SHIFT); 70.277 + ASSERT(mfn_valid(smfn)); 70.278 + 70.279 +#if (SHADOW_PAGING_LEVELS == 4) 70.280 + /* up to l3 */ 70.281 + sp = mfn_to_shadow_page(smfn); 70.282 + if ( sp->count != 1 || !sp->up ) 70.283 + return 0; 70.284 + smfn = _mfn(sp->up >> PAGE_SHIFT); 70.285 + ASSERT(mfn_valid(smfn)); 70.286 + 70.287 + /* up to l4 */ 70.288 + sp = mfn_to_shadow_page(smfn); 70.289 + if ( sp->count != 1 70.290 + || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up ) 70.291 + return 0; 70.292 + smfn = _mfn(sp->up >> PAGE_SHIFT); 70.293 + ASSERT(mfn_valid(smfn)); 70.294 + 70.295 +#if (GUEST_PAGING_LEVELS == 2) 70.296 + /* In 2-on-3 shadow mode the up pointer contains the link to the 70.297 + * shadow page, but the shadow_table contains only the first of the 70.298 + * four pages that makes the PAE top shadow tables. */ 70.299 + smfn = _mfn(mfn_x(smfn) & ~0x3UL); 70.300 +#endif 70.301 + 70.302 +#endif 70.303 + 70.304 + if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn) 70.305 +#if (SHADOW_PAGING_LEVELS == 3) 70.306 + || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn) 70.307 + || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn) 70.308 + || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 70.309 +#endif 70.310 + ) 70.311 + return 0; 70.312 + 70.313 + /* Only in use in one toplevel shadow, and it's not the one we're 70.314 + * running on */ 70.315 + return 1; 70.316 +} 70.317 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ 70.318 + 70.319 70.320 /**************************************************************************/ 70.321 /* Functions which translate and install the shadows of arbitrary guest 70.322 @@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v, 70.323 shadow_l1e_t sl1e; 70.324 u32 gflags; 70.325 p2m_type_t p2mt; 70.326 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.327 + guest_l1e_t *snpl1p = NULL; 70.328 +#endif /* OOS */ 70.329 + 70.330 70.331 /* Prefetch no further than the end of the _shadow_ l1 MFN */ 70.332 dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e; 70.333 @@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v, 70.334 /* Normal guest page; grab the next guest entry */ 70.335 gl1p = sh_map_domain_page(gw->l1mfn); 70.336 gl1p += guest_l1_table_offset(gw->va); 70.337 + 70.338 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.339 + if ( mfn_is_out_of_sync(gw->l1mfn) ) 70.340 + { 70.341 + mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn); 70.342 + 70.343 + ASSERT(mfn_valid(snpmfn)); 70.344 + snpl1p = sh_map_domain_page(snpmfn); 70.345 + snpl1p += guest_l1_table_offset(gw->va); 70.346 + } 70.347 +#endif /* OOS */ 70.348 } 70.349 70.350 for ( i = 1; i < dist ; i++ ) 70.351 @@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v, 70.352 /* Propagate the entry. 
*/ 70.353 l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt); 70.354 (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn); 70.355 + 70.356 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.357 + if ( snpl1p != NULL ) 70.358 + snpl1p[i] = gl1e; 70.359 +#endif /* OOS */ 70.360 } 70.361 if ( gl1p != NULL ) 70.362 sh_unmap_domain_page(gl1p); 70.363 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.364 + if ( snpl1p != NULL ) 70.365 + sh_unmap_domain_page(snpl1p); 70.366 +#endif /* OOS */ 70.367 } 70.368 70.369 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */ 70.370 @@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v, 70.371 int r; 70.372 fetch_type_t ft = 0; 70.373 p2m_type_t p2mt; 70.374 + uint32_t rc; 70.375 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION 70.376 int fast_emul = 0; 70.377 #endif 70.378 @@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v, 70.379 { 70.380 fast_emul = 1; 70.381 gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn); 70.382 + 70.383 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.384 + /* Fall back to the slow path if we're trying to emulate 70.385 + writes to an out of sync page. */ 70.386 + if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) ) 70.387 + { 70.388 + v->arch.paging.last_write_emul_ok = 0; 70.389 + goto page_fault_slow_path; 70.390 + } 70.391 +#endif /* OOS */ 70.392 + 70.393 perfc_incr(shadow_fault_fast_emulate); 70.394 goto early_emulation; 70.395 } 70.396 @@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v, 70.397 sizeof(sl1e)) == 0) 70.398 && sh_l1e_is_magic(sl1e)) ) 70.399 { 70.400 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.401 + /* First, need to check that this isn't an out-of-sync 70.402 + * shadow l1e. If it is, we fall back to the slow path, which 70.403 + * will sync it up again. 
*/ 70.404 + { 70.405 + shadow_l2e_t sl2e; 70.406 + mfn_t gl1mfn; 70.407 + if ( (__copy_from_user(&sl2e, 70.408 + (sh_linear_l2_table(v) 70.409 + + shadow_l2_linear_offset(va)), 70.410 + sizeof(sl2e)) != 0) 70.411 + || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) 70.412 + || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page( 70.413 + shadow_l2e_get_mfn(sl2e))->backpointer)) 70.414 + || unlikely(mfn_is_out_of_sync(gl1mfn)) ) 70.415 + { 70.416 + /* Hit the slow path as if there had been no 70.417 + * shadow entry at all, and let it tidy up */ 70.418 + ASSERT(regs->error_code & PFEC_page_present); 70.419 + regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); 70.420 + goto page_fault_slow_path; 70.421 + } 70.422 + } 70.423 +#endif /* SHOPT_OUT_OF_SYNC */ 70.424 + 70.425 if ( sh_l1e_is_gnp(sl1e) ) 70.426 { 70.427 /* Not-present in a guest PT: pass to the guest as 70.428 @@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v, 70.429 return EXCRET_fault_fixed; 70.430 } 70.431 } 70.432 + 70.433 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.434 + page_fault_slow_path: 70.435 +#endif 70.436 #endif /* SHOPT_FAST_FAULT_PATH */ 70.437 70.438 /* Detect if this page fault happened while we were already in Xen 70.439 @@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v, 70.440 return 0; 70.441 } 70.442 70.443 - if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 ) 70.444 + rewalk: 70.445 + rc = guest_walk_tables(v, va, &gw, regs->error_code); 70.446 + 70.447 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.448 + if ( !(rc & _PAGE_PRESENT) ) 70.449 + regs->error_code |= PFEC_page_present; 70.450 + else if ( regs->error_code & PFEC_page_present ) 70.451 + { 70.452 + SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB" 70.453 + " flushing. Have fun debugging it.\n"); 70.454 + regs->error_code &= ~PFEC_page_present; 70.455 + } 70.456 +#endif 70.457 + 70.458 + if ( rc != 0 ) 70.459 { 70.460 perfc_incr(shadow_fault_bail_real_fault); 70.461 SHADOW_PRINTK("not a shadow fault\n"); 70.462 @@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v, 70.463 70.464 shadow_lock(d); 70.465 70.466 - if ( gw_remove_write_accesses(v, va, &gw) ) 70.467 + rc = gw_remove_write_accesses(v, va, &gw); 70.468 + 70.469 + /* First bit set: Removed write access to a page. */ 70.470 + if ( rc & GW_RMWR_FLUSHTLB ) 70.471 { 70.472 /* Write permission removal is also a hint that other gwalks 70.473 * overlapping with this one may be inconsistent 70.474 @@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v, 70.475 flush_tlb_mask(d->domain_dirty_cpumask); 70.476 } 70.477 70.478 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.479 + /* Second bit set: Resynced a page. Re-walk needed. */ 70.480 + if ( rc & GW_RMWR_REWALK ) 70.481 + { 70.482 + shadow_unlock(d); 70.483 + goto rewalk; 70.484 + } 70.485 +#endif /* OOS */ 70.486 + 70.487 if ( !shadow_check_gwalk(v, va, &gw) ) 70.488 { 70.489 perfc_incr(shadow_inconsistent_gwalk); 70.490 shadow_unlock(d); 70.491 - return EXCRET_fault_fixed; 70.492 + goto rewalk; 70.493 } 70.494 70.495 shadow_audit_tables(v); 70.496 @@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v, 70.497 return 0; 70.498 } 70.499 70.500 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.501 + /* Always unsync when writing to L1 page tables. 
*/ 70.502 + if ( sh_mfn_is_a_page_table(gmfn) 70.503 + && ft == ft_demand_write ) 70.504 + sh_unsync(v, gmfn, va); 70.505 +#endif /* OOS */ 70.506 + 70.507 /* Calculate the shadow entry and write it */ 70.508 l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt); 70.509 r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); 70.510 70.511 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.512 + if ( mfn_valid(gw.l1mfn) 70.513 + && mfn_is_out_of_sync(gw.l1mfn) ) 70.514 + { 70.515 + /* Update the OOS snapshot. */ 70.516 + mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn); 70.517 + guest_l1e_t *snp; 70.518 + 70.519 + ASSERT(mfn_valid(snpmfn)); 70.520 + 70.521 + snp = sh_map_domain_page(snpmfn); 70.522 + snp[guest_l1_table_offset(va)] = gw.l1e; 70.523 + sh_unmap_domain_page(snp); 70.524 + } 70.525 +#endif /* OOS */ 70.526 + 70.527 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH 70.528 /* Prefetch some more shadow entries */ 70.529 sh_prefetch(v, &gw, ptr_sl1e, sl1mfn); 70.530 #endif 70.531 70.532 /* Need to emulate accesses to page tables */ 70.533 - if ( sh_mfn_is_a_page_table(gmfn) ) 70.534 + if ( sh_mfn_is_a_page_table(gmfn) 70.535 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.536 + /* Unless they've been allowed to go out of sync with their shadows */ 70.537 + && !mfn_is_out_of_sync(gmfn) 70.538 +#endif 70.539 + ) 70.540 { 70.541 if ( ft == ft_demand_write ) 70.542 { 70.543 @@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long 70.544 * instruction should be issued on the hardware, or 0 if it's safe not 70.545 * to do so. */ 70.546 { 70.547 + mfn_t sl1mfn; 70.548 shadow_l2e_t sl2e; 70.549 70.550 perfc_incr(shadow_invlpg); 70.551 @@ -3278,13 +3604,65 @@ sh_invlpg(struct vcpu *v, unsigned long 70.552 // If so, then we'll need to flush the entire TLB (because that's 70.553 // easier than invalidating all of the individual 4K pages). 70.554 // 70.555 - if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type 70.556 + sl1mfn = shadow_l2e_get_mfn(sl2e); 70.557 + if ( mfn_to_shadow_page(sl1mfn)->type 70.558 == SH_type_fl1_shadow ) 70.559 { 70.560 flush_tlb_local(); 70.561 return 0; 70.562 } 70.563 70.564 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.565 + /* Check to see if the SL1 is out of sync. */ 70.566 + { 70.567 + mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); 70.568 + struct page_info *pg = mfn_to_page(gl1mfn); 70.569 + if ( mfn_valid(gl1mfn) 70.570 + && page_is_out_of_sync(pg) ) 70.571 + { 70.572 + /* The test above may give false positives, since we don't 70.573 + * hold the shadow lock yet. Check again with the lock held. */ 70.574 + shadow_lock(v->domain); 70.575 + 70.576 + /* This must still be a copy-from-user because we didn't 70.577 + * have the shadow lock last time we checked, and the 70.578 + * higher-level shadows might have disappeared under our 70.579 + * feet. 
*/ 70.580 + if ( __copy_from_user(&sl2e, 70.581 + sh_linear_l2_table(v) 70.582 + + shadow_l2_linear_offset(va), 70.583 + sizeof (sl2e)) != 0 ) 70.584 + { 70.585 + perfc_incr(shadow_invlpg_fault); 70.586 + shadow_unlock(v->domain); 70.587 + return 0; 70.588 + } 70.589 + 70.590 + if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) 70.591 + { 70.592 + shadow_unlock(v->domain); 70.593 + return 0; 70.594 + } 70.595 + 70.596 + sl1mfn = shadow_l2e_get_mfn(sl2e); 70.597 + gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); 70.598 + pg = mfn_to_page(gl1mfn); 70.599 + 70.600 + if ( likely(sh_mfn_is_a_page_table(gl1mfn) 70.601 + && page_is_out_of_sync(pg) ) ) 70.602 + { 70.603 + shadow_l1e_t *sl1; 70.604 + sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va); 70.605 + /* Remove the shadow entry that maps this VA */ 70.606 + (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn); 70.607 + } 70.608 + shadow_unlock(v->domain); 70.609 + /* Need the invlpg, to pick up the disappeareance of the sl1e */ 70.610 + return 1; 70.611 + } 70.612 + } 70.613 +#endif 70.614 + 70.615 return 1; 70.616 } 70.617 70.618 @@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc 70.619 return; 70.620 } 70.621 70.622 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.623 + /* Need to resync all the shadow entries on a TLB flush. Resync 70.624 + * current vcpus OOS pages before switching to the new shadow 70.625 + * tables so that the VA hint is still valid. */ 70.626 + shadow_resync_current_vcpu(v, do_locking); 70.627 +#endif 70.628 + 70.629 if ( do_locking ) shadow_lock(v->domain); 70.630 70.631 ASSERT(shadow_locked_by_me(v->domain)); 70.632 @@ -3938,12 +4323,71 @@ sh_update_cr3(struct vcpu *v, int do_loc 70.633 70.634 /* Release the lock, if we took it (otherwise it's the caller's problem) */ 70.635 if ( do_locking ) shadow_unlock(v->domain); 70.636 + 70.637 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.638 + /* Need to resync all the shadow entries on a TLB flush. We only 70.639 + * update the shadows, leaving the pages out of sync. Also, we try 70.640 + * to skip synchronization of shadows not mapped in the new 70.641 + * tables. */ 70.642 + shadow_sync_other_vcpus(v, do_locking); 70.643 +#endif 70.644 + 70.645 } 70.646 70.647 70.648 /**************************************************************************/ 70.649 /* Functions to revoke guest rights */ 70.650 70.651 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC 70.652 +int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 70.653 + mfn_t smfn, unsigned long off) 70.654 +{ 70.655 + int r; 70.656 + shadow_l1e_t *sl1p, sl1e; 70.657 + struct shadow_page_info *sp; 70.658 + 70.659 + ASSERT(mfn_valid(gmfn)); 70.660 + ASSERT(mfn_valid(smfn)); 70.661 + 70.662 + sp = mfn_to_shadow_page(smfn); 70.663 + 70.664 + if ( sp->mbz != 0 || 70.665 +#if GUEST_PAGING_LEVELS == 4 70.666 + (sp->type != SH_type_l1_64_shadow) 70.667 +#elif GUEST_PAGING_LEVELS == 3 70.668 + (sp->type != SH_type_l1_pae_shadow) 70.669 +#elif GUEST_PAGING_LEVELS == 2 70.670 + (sp->type != SH_type_l1_32_shadow) 70.671 +#endif 70.672 + ) 70.673 + goto fail; 70.674 + 70.675 + sl1p = sh_map_domain_page(smfn); 70.676 + sl1p += off; 70.677 + sl1e = *sl1p; 70.678 + if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) 70.679 + != (_PAGE_PRESENT|_PAGE_RW)) 70.680 + || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) 70.681 + { 70.682 + sh_unmap_domain_page(sl1p); 70.683 + goto fail; 70.684 + } 70.685 + 70.686 + /* Found it! Need to remove its write permissions. 
*/ 70.687 + sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); 70.688 + r = shadow_set_l1e(v, sl1p, sl1e, smfn); 70.689 + ASSERT( !(r & SHADOW_SET_ERROR) ); 70.690 + 70.691 + sh_unmap_domain_page(sl1p); 70.692 + perfc_incr(shadow_writeable_h_7); 70.693 + return 1; 70.694 + 70.695 + fail: 70.696 + perfc_incr(shadow_writeable_h_8); 70.697 + return 0; 70.698 +} 70.699 +#endif /* OOS */ 70.700 + 70.701 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC 70.702 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) 70.703 /* Look up this vaddr in the current shadow and see if it's a writeable 70.704 @@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v, 70.705 70.706 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES 70.707 70.708 -#define AUDIT_FAIL(_level, _fmt, _a...) do { \ 70.709 - printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ 70.710 - "gl" #_level "mfn = %" PRI_mfn \ 70.711 - " sl" #_level "mfn = %" PRI_mfn \ 70.712 - " &gl" #_level "e = %p &sl" #_level "e = %p" \ 70.713 - " gl" #_level "e = %" SH_PRI_gpte \ 70.714 - " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ 70.715 - GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ 70.716 - _level, guest_index(gl ## _level ## e), \ 70.717 - mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ 70.718 - gl ## _level ## e, sl ## _level ## e, \ 70.719 - gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ 70.720 - ##_a); \ 70.721 - BUG(); \ 70.722 - done = 1; \ 70.723 +#define AUDIT_FAIL(_level, _fmt, _a...) do { \ 70.724 + printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ 70.725 + "gl" #_level "mfn = %" PRI_mfn \ 70.726 + " sl" #_level "mfn = %" PRI_mfn \ 70.727 + " &gl" #_level "e = %p &sl" #_level "e = %p" \ 70.728 + " gl" #_level "e = %" SH_PRI_gpte \ 70.729 + " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ 70.730 + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ 70.731 + _level, guest_index(gl ## _level ## e), \ 70.732 + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ 70.733 + gl ## _level ## e, sl ## _level ## e, \ 70.734 + gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ 70.735 + ##_a); \ 70.736 + BUG(); \ 70.737 + done = 1; \ 70.738 } while (0) 70.739 70.740 +#define AUDIT_FAIL_MIN(_level, _fmt, _a...) 
do { \ 70.741 + printk("Shadow %u-on-%u audit failed at level %i\n" \ 70.742 + "gl" #_level "mfn = %" PRI_mfn \ 70.743 + " sl" #_level "mfn = %" PRI_mfn \ 70.744 + " Error: " _fmt "\n", \ 70.745 + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ 70.746 + _level, \ 70.747 + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ 70.748 + ##_a); \ 70.749 + BUG(); \ 70.750 + done = 1; \ 70.751 +} while (0) 70.752 70.753 static char * sh_audit_flags(struct vcpu *v, int level, 70.754 int gflags, int sflags) 70.755 @@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf 70.756 70.757 /* Follow the backpointer */ 70.758 gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); 70.759 + 70.760 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.761 + /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */ 70.762 + if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) ) 70.763 + { 70.764 + oos_audit_hash_is_present(v->domain, gl1mfn); 70.765 + return 0; 70.766 + } 70.767 +#endif 70.768 + 70.769 gl1e = gp = sh_map_domain_page(gl1mfn); 70.770 SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { 70.771 70.772 @@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf 70.773 70.774 /* Follow the backpointer */ 70.775 gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer); 70.776 + 70.777 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.778 + /* Only L1's may be out of sync. */ 70.779 + if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) ) 70.780 + AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn)); 70.781 +#endif 70.782 + 70.783 gl2e = gp = sh_map_domain_page(gl2mfn); 70.784 SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, { 70.785 70.786 @@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf 70.787 70.788 /* Follow the backpointer */ 70.789 gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer); 70.790 + 70.791 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.792 + /* Only L1's may be out of sync. */ 70.793 + if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) ) 70.794 + AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn)); 70.795 +#endif 70.796 + 70.797 gl3e = gp = sh_map_domain_page(gl3mfn); 70.798 SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { 70.799 70.800 @@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf 70.801 70.802 /* Follow the backpointer */ 70.803 gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer); 70.804 + 70.805 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 70.806 + /* Only L1's may be out of sync. */ 70.807 + if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) ) 70.808 + AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn)); 70.809 +#endif 70.810 + 70.811 gl4e = gp = sh_map_domain_page(gl4mfn); 70.812 SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain, 70.813 {
71.1 --- a/xen/arch/x86/mm/shadow/multi.h Thu Jun 19 12:48:04 2008 +0900 71.2 +++ b/xen/arch/x86/mm/shadow/multi.h Wed Jul 02 11:30:37 2008 +0900 71.3 @@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_ 71.4 71.5 extern struct paging_mode 71.6 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS); 71.7 + 71.8 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC 71.9 +extern void 71.10 +SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS) 71.11 + (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn); 71.12 + 71.13 +extern int 71.14 +SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS) 71.15 + (struct vcpu*v, mfn_t gmfn); 71.16 + 71.17 +extern int 71.18 +SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS) 71.19 + (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); 71.20 +#endif
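multi.h exports the new per-guest-level entry points (sh_resync_l1, sh_safe_not_to_sync, sh_rm_write_access_from_sl1p) through SHADOW_INTERNAL_NAME, so each build of the file for a given guest paging depth gets its own symbols. The macro below is only a simplified stand-in showing the token-pasting idea behind such per-level names; it is not the real Xen macro, which encodes the levels differently.

    /* Toy name-per-level mangling: paste the guest level onto the function
     * name so each instantiation of the source file defines a distinct symbol. */
    #define NAME_PASTE_(a, b)  a ## b
    #define NAME_PASTE(a, b)   NAME_PASTE_(a, b)
    #define DEMO_INTERNAL_NAME(name, levels)  NAME_PASTE(NAME_PASTE(name, _), levels)

    #define GUEST_LEVELS 4

    /* Expands to: int demo_resync_l1_4(int entry) { ... } */
    int DEMO_INTERNAL_NAME(demo_resync_l1, GUEST_LEVELS)(int entry)
    {
        return entry;                      /* placeholder body */
    }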
72.1 --- a/xen/arch/x86/mm/shadow/private.h Thu Jun 19 12:48:04 2008 +0900 72.2 +++ b/xen/arch/x86/mm/shadow/private.h Wed Jul 02 11:30:37 2008 +0900 72.3 @@ -63,8 +63,9 @@ extern int shadow_audit_enable; 72.4 #define SHOPT_SKIP_VERIFY 0x20 /* Skip PTE v'fy when safe to do so */ 72.5 #define SHOPT_VIRTUAL_TLB 0x40 /* Cache guest v->p translations */ 72.6 #define SHOPT_FAST_EMULATION 0x80 /* Fast write emulation */ 72.7 +#define SHOPT_OUT_OF_SYNC 0x100 /* Allow guest writes to L1 PTs */ 72.8 72.9 -#define SHADOW_OPTIMIZATIONS 0xff 72.10 +#define SHADOW_OPTIMIZATIONS 0x1ff 72.11 72.12 72.13 /****************************************************************************** 72.14 @@ -195,9 +196,9 @@ struct shadow_page_info 72.15 u32 tlbflush_timestamp; 72.16 }; 72.17 struct { 72.18 - unsigned int type:4; /* What kind of shadow is this? */ 72.19 + unsigned int type:5; /* What kind of shadow is this? */ 72.20 unsigned int pinned:1; /* Is the shadow pinned? */ 72.21 - unsigned int count:27; /* Reference count */ 72.22 + unsigned int count:26; /* Reference count */ 72.23 u32 mbz; /* Must be zero: this is where the owner 72.24 * field lives in a non-shadow page */ 72.25 } __attribute__((packed)); 72.26 @@ -242,7 +243,8 @@ static inline void shadow_check_page_str 72.27 #define SH_type_max_shadow (13U) 72.28 #define SH_type_p2m_table (14U) /* in use as the p2m table */ 72.29 #define SH_type_monitor_table (15U) /* in use as a monitor table */ 72.30 -#define SH_type_unused (16U) 72.31 +#define SH_type_oos_snapshot (16U) /* in use as OOS snapshot */ 72.32 +#define SH_type_unused (17U) 72.33 72.34 /* 72.35 * What counts as a pinnable shadow? 72.36 @@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st 72.37 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE) 72.38 #define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64) 72.39 72.40 +#define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64) 72.41 + 72.42 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 72.43 +/* Marks a guest L1 page table which is shadowed but not write-protected. 72.44 + * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 72.45 + * 72.46 + * out_of_sync indicates that the shadow tables may not reflect the 72.47 + * guest tables. If it is clear, then the shadow tables *must* reflect 72.48 + * the guest tables. 72.49 + * 72.50 + * oos_may_write indicates that a page may have writable mappings. 72.51 + * 72.52 + * Most of the time the flags are synonymous. There is a short period of time 72.53 + * during resync that oos_may_write is clear but out_of_sync is not. If a 72.54 + * codepath is called during that time and is sensitive to oos issues, it may 72.55 + * need to use the second flag. 72.56 + */ 72.57 +#define SHF_out_of_sync (1u<<30) 72.58 +#define SHF_oos_may_write (1u<<29) 72.59 + 72.60 +/* Fixup tables are a non-complete writable-mappings reverse map for 72.61 + OOS pages. This let us quickly resync pages (avoiding brute-force 72.62 + search of the shadows) when the va hint is not sufficient (i.e., 72.63 + the pagetable is mapped in multiple places and in multiple 72.64 + shadows.) 
*/ 72.65 +#define SHADOW_OOS_FT_ENTRIES \ 72.66 + ((PAGE_SIZE << SHADOW_OOS_FT_ORDER) \ 72.67 + / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup))) 72.68 + 72.69 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ 72.70 + 72.71 +static inline int sh_page_has_multiple_shadows(struct page_info *pg) 72.72 +{ 72.73 + u32 shadows; 72.74 + if ( !(pg->count_info & PGC_page_table) ) 72.75 + return 0; 72.76 + shadows = pg->shadow_flags & SHF_page_type_mask; 72.77 + /* More than one type bit set in shadow-flags? */ 72.78 + return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 ); 72.79 +} 72.80 + 72.81 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 72.82 +/* The caller must verify this is reasonable to call; i.e., valid mfn, 72.83 + * domain is translated, &c */ 72.84 +static inline int page_is_out_of_sync(struct page_info *p) 72.85 +{ 72.86 + return (p->count_info & PGC_page_table) 72.87 + && (p->shadow_flags & SHF_out_of_sync); 72.88 +} 72.89 + 72.90 +static inline int mfn_is_out_of_sync(mfn_t gmfn) 72.91 +{ 72.92 + return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn))); 72.93 +} 72.94 + 72.95 +static inline int page_oos_may_write(struct page_info *p) 72.96 +{ 72.97 + return (p->count_info & PGC_page_table) 72.98 + && (p->shadow_flags & SHF_oos_may_write); 72.99 +} 72.100 + 72.101 +static inline int mfn_oos_may_write(mfn_t gmfn) 72.102 +{ 72.103 + return page_oos_may_write(mfn_to_page(mfn_x(gmfn))); 72.104 +} 72.105 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ 72.106 72.107 /****************************************************************************** 72.108 * Various function declarations 72.109 @@ -351,7 +419,57 @@ int shadow_write_guest_entry(struct vcpu 72.110 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, 72.111 intpte_t *old, intpte_t new, mfn_t gmfn); 72.112 72.113 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 72.114 +/* Allow a shadowed page to go out of sync */ 72.115 +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va); 72.116 72.117 +/* Pull an out-of-sync page back into sync. */ 72.118 +void sh_resync(struct vcpu *v, mfn_t gmfn); 72.119 + 72.120 +void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); 72.121 + 72.122 +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 72.123 + mfn_t smfn, unsigned long offset); 72.124 + 72.125 +/* Pull all out-of-sync shadows back into sync. If skip != 0, we try 72.126 + * to avoid resyncing where we think we can get away with it. 
*/ 72.127 + 72.128 +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking); 72.129 + 72.130 +static inline void 72.131 +shadow_resync_all(struct vcpu *v, int do_locking) 72.132 +{ 72.133 + sh_resync_all(v, 72.134 + 0 /* skip */, 72.135 + 1 /* this */, 72.136 + 1 /* others */, 72.137 + do_locking); 72.138 +} 72.139 + 72.140 +static inline void 72.141 +shadow_resync_current_vcpu(struct vcpu *v, int do_locking) 72.142 +{ 72.143 + sh_resync_all(v, 72.144 + 0 /* skip */, 72.145 + 1 /* this */, 72.146 + 0 /* others */, 72.147 + do_locking); 72.148 +} 72.149 + 72.150 +static inline void 72.151 +shadow_sync_other_vcpus(struct vcpu *v, int do_locking) 72.152 +{ 72.153 + sh_resync_all(v, 72.154 + 1 /* skip */, 72.155 + 0 /* this */, 72.156 + 1 /* others */, 72.157 + do_locking); 72.158 +} 72.159 + 72.160 +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn); 72.161 +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn); 72.162 + 72.163 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ 72.164 72.165 /****************************************************************************** 72.166 * Flags used in the return value of the shadow_set_lXe() functions...
73.1 --- a/xen/arch/x86/mm/shadow/types.h Thu Jun 19 12:48:04 2008 +0900 73.2 +++ b/xen/arch/x86/mm/shadow/types.h Wed Jul 02 11:30:37 2008 +0900 73.3 @@ -438,6 +438,11 @@ struct shadow_walk_t 73.4 #define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap) 73.5 #define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry) 73.6 73.7 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC 73.8 +#define sh_resync_l1 INTERNAL_NAME(sh_resync_l1) 73.9 +#define sh_safe_not_to_sync INTERNAL_NAME(sh_safe_not_to_sync) 73.10 +#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p) 73.11 +#endif 73.12 73.13 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */ 73.14 #define sh_guest_map_l1e \
74.1 --- a/xen/arch/x86/platform_hypercall.c Thu Jun 19 12:48:04 2008 +0900 74.2 +++ b/xen/arch/x86/platform_hypercall.c Wed Jul 02 11:30:37 2008 +0900 74.3 @@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe 74.4 cpu_count++; 74.5 } 74.6 if ( cpu_count == num_online_cpus() ) 74.7 - ret = acpi_cpufreq_init(); 74.8 + { 74.9 + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) 74.10 + ret = powernow_cpufreq_init(); 74.11 + else 74.12 + ret = acpi_cpufreq_init(); 74.13 + } 74.14 break; 74.15 } 74.16
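The platform hypercall change waits until every online CPU has uploaded its P-state data and then picks the cpufreq driver by vendor: the PowerNow! path for AMD, the ACPI driver otherwise. The toy dispatch below mirrors that if/else; the names are stand-ins for illustration, not the Xen symbols.

    #include <stdio.h>

    enum cpu_vendor { VENDOR_INTEL, VENDOR_AMD };

    static int powernow_like_init(void)     { puts("PowerNow!-style driver"); return 0; }
    static int acpi_cpufreq_like_init(void) { puts("generic ACPI driver");    return 0; }

    /* AMD gets the PowerNow! path, every other vendor falls back to the
     * ACPI cpufreq path, as in the hunk above. */
    static int pick_cpufreq_driver(enum cpu_vendor v)
    {
        return (v == VENDOR_AMD) ? powernow_like_init() : acpi_cpufreq_like_init();
    }

    int main(void)
    {
        return pick_cpufreq_driver(VENDOR_AMD);
    }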
75.1 --- a/xen/arch/x86/x86_emulate/x86_emulate.c Thu Jun 19 12:48:04 2008 +0900 75.2 +++ b/xen/arch/x86/x86_emulate/x86_emulate.c Wed Jul 02 11:30:37 2008 +0900 75.3 @@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = { 75.4 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 75.5 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 75.6 /* 0xD0 - 0xD7 */ 75.7 - ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 75.8 - ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 75.9 + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 75.10 + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 75.11 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 75.12 /* 0xD8 - 0xDF */ 75.13 - 0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov, 75.14 - 0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, 75.15 + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, 75.16 + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, 75.17 + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, 75.18 + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, 75.19 /* 0xE0 - 0xE7 */ 75.20 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 75.21 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 75.22 @@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = { 75.23 ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, 75.24 /* 0xA0 - 0xA7 */ 75.25 ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM, 75.26 - DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, 75.27 + DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, 75.28 /* 0xA8 - 0xAF */ 75.29 ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM, 75.30 DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM, 75.31 @@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = { 75.32 /* Type, address-of, and value of an instruction's operand. */ 75.33 struct operand { 75.34 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 75.35 - unsigned int bytes; 75.36 - unsigned long val, orig_val; 75.37 + unsigned int bytes; 75.38 + 75.39 + /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */ 75.40 + union { 75.41 + unsigned long val; 75.42 + uint32_t bigval[4]; 75.43 + }; 75.44 + 75.45 + /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */ 75.46 + union { 75.47 + unsigned long orig_val; 75.48 + uint32_t orig_bigval[4]; 75.49 + }; 75.50 + 75.51 union { 75.52 /* OP_REG: Pointer to register field. */ 75.53 unsigned long *reg; 75.54 @@ -466,7 +480,7 @@ do{ asm volatile ( 75.55 75.56 /* Fetch next part of the instruction being emulated. */ 75.57 #define insn_fetch_bytes(_size) \ 75.58 -({ unsigned long _x, _eip = _regs.eip; \ 75.59 +({ unsigned long _x = 0, _eip = _regs.eip; \ 75.60 if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \ 75.61 _regs.eip += (_size); /* real hardware doesn't truncate */ \ 75.62 generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \ 75.63 @@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic; 75.64 put_fpu(&fic); \ 75.65 } while (0) 75.66 75.67 +#define emulate_fpu_insn_memsrc(_op, _arg) \ 75.68 +do{ struct fpu_insn_ctxt fic; \ 75.69 + get_fpu(X86EMUL_FPU_fpu, &fic); \ 75.70 + asm volatile ( \ 75.71 + "movb $2f-1f,%0 \n" \ 75.72 + "1: " _op " %1 \n" \ 75.73 + "2: \n" \ 75.74 + : "=m" (fic.insn_bytes) \ 75.75 + : "m" (_arg) : "memory" ); \ 75.76 + put_fpu(&fic); \ 75.77 +} while (0) 75.78 + 75.79 #define emulate_fpu_insn_stub(_bytes...) 
\ 75.80 do{ uint8_t stub[] = { _bytes, 0xc3 }; \ 75.81 struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; \ 75.82 @@ -655,6 +681,19 @@ static void __put_rep_prefix( 75.83 __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \ 75.84 }) 75.85 75.86 +/* Compatibility function: read guest memory, zero-extend result to a ulong. */ 75.87 +static int read_ulong( 75.88 + enum x86_segment seg, 75.89 + unsigned long offset, 75.90 + unsigned long *val, 75.91 + unsigned int bytes, 75.92 + struct x86_emulate_ctxt *ctxt, 75.93 + struct x86_emulate_ops *ops) 75.94 +{ 75.95 + *val = 0; 75.96 + return ops->read(seg, offset, val, bytes, ctxt); 75.97 +} 75.98 + 75.99 /* 75.100 * Unsigned multiplication with double-word result. 75.101 * IN: Multiplicand=m[0], Multiplier=m[1] 75.102 @@ -841,7 +880,8 @@ static int ioport_access_check( 75.103 (tr.limit < 0x67) ) 75.104 goto raise_exception; 75.105 75.106 - if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) ) 75.107 + if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66, 75.108 + &iobmp, 2, ctxt, ops)) ) 75.109 return rc; 75.110 75.111 /* Ensure TSS includes two bytes including byte containing first port. */ 75.112 @@ -849,7 +889,8 @@ static int ioport_access_check( 75.113 if ( tr.limit <= iobmp ) 75.114 goto raise_exception; 75.115 75.116 - if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) ) 75.117 + if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp, 75.118 + &iobmp, 2, ctxt, ops)) ) 75.119 return rc; 75.120 if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 ) 75.121 goto raise_exception; 75.122 @@ -941,12 +982,12 @@ protmode_load_seg( 75.123 goto raise_exn; 75.124 75.125 do { 75.126 - if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8), 75.127 - &val, 4, ctxt)) ) 75.128 + if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8), 75.129 + &val, 4, ctxt, ops)) ) 75.130 return rc; 75.131 desc.a = val; 75.132 - if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4, 75.133 - &val, 4, ctxt)) ) 75.134 + if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4, 75.135 + &val, 4, ctxt, ops)) ) 75.136 return rc; 75.137 desc.b = val; 75.138 75.139 @@ -992,14 +1033,15 @@ protmode_load_seg( 75.140 if ( (desc.b & (5u<<9)) == (4u<<9) ) 75.141 goto raise_exn; 75.142 /* Non-conforming segment: check DPL against RPL and CPL. */ 75.143 - if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) ) 75.144 + if ( ((desc.b & (6u<<9)) != (6u<<9)) && 75.145 + ((dpl < cpl) || (dpl < rpl)) ) 75.146 goto raise_exn; 75.147 break; 75.148 } 75.149 75.150 /* Ensure Accessed flag is set. */ 75.151 new_desc_b = desc.b | 0x100; 75.152 - rc = ((desc.b & 0x100) ? X86EMUL_OKAY : 75.153 + rc = ((desc.b & 0x100) ? X86EMUL_OKAY : 75.154 ops->cmpxchg( 75.155 x86_seg_none, desctab.base + (sel & 0xfff8) + 4, 75.156 &desc.b, &new_desc_b, 4, ctxt)); 75.157 @@ -1061,16 +1103,16 @@ decode_register( 75.158 case 2: p = ®s->edx; break; 75.159 case 3: p = ®s->ebx; break; 75.160 case 4: p = (highbyte_regs ? 75.161 - ((unsigned char *)®s->eax + 1) : 75.162 + ((unsigned char *)®s->eax + 1) : 75.163 (unsigned char *)®s->esp); break; 75.164 case 5: p = (highbyte_regs ? 75.165 - ((unsigned char *)®s->ecx + 1) : 75.166 + ((unsigned char *)®s->ecx + 1) : 75.167 (unsigned char *)®s->ebp); break; 75.168 case 6: p = (highbyte_regs ? 75.169 - ((unsigned char *)®s->edx + 1) : 75.170 + ((unsigned char *)®s->edx + 1) : 75.171 (unsigned char *)®s->esi); break; 75.172 case 7: p = (highbyte_regs ? 
75.173 - ((unsigned char *)®s->ebx + 1) : 75.174 + ((unsigned char *)®s->ebx + 1) : 75.175 (unsigned char *)®s->edi); break; 75.176 #if defined(__x86_64__) 75.177 case 8: p = ®s->r8; break; 75.178 @@ -1402,8 +1444,8 @@ x86_emulate( 75.179 case 8: src.val = *(uint64_t *)src.reg; break; 75.180 } 75.181 } 75.182 - else if ( (rc = ops->read(src.mem.seg, src.mem.off, 75.183 - &src.val, src.bytes, ctxt)) ) 75.184 + else if ( (rc = read_ulong(src.mem.seg, src.mem.off, 75.185 + &src.val, src.bytes, ctxt, ops)) ) 75.186 goto done; 75.187 break; 75.188 case SrcImm: 75.189 @@ -1494,8 +1536,8 @@ x86_emulate( 75.190 } 75.191 else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */ 75.192 { 75.193 - if ( (rc = ops->read(dst.mem.seg, dst.mem.off, 75.194 - &dst.val, dst.bytes, ctxt)) ) 75.195 + if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, 75.196 + &dst.val, dst.bytes, ctxt, ops)) ) 75.197 goto done; 75.198 dst.orig_val = dst.val; 75.199 } 75.200 @@ -1571,8 +1613,8 @@ x86_emulate( 75.201 int lb, ub, idx; 75.202 generate_exception_if(mode_64bit() || (src.type != OP_MEM), 75.203 EXC_UD, -1); 75.204 - if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes, 75.205 - &src_val2, op_bytes, ctxt)) ) 75.206 + if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes, 75.207 + &src_val2, op_bytes, ctxt, ops)) ) 75.208 goto done; 75.209 ub = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2; 75.210 lb = (op_bytes == 2) ? (int16_t)src.val : (int32_t)src.val; 75.211 @@ -1588,8 +1630,8 @@ x86_emulate( 75.212 /* movsxd */ 75.213 if ( src.type == OP_REG ) 75.214 src.val = *(int32_t *)src.reg; 75.215 - else if ( (rc = ops->read(src.mem.seg, src.mem.off, 75.216 - &src.val, 4, ctxt)) ) 75.217 + else if ( (rc = read_ulong(src.mem.seg, src.mem.off, 75.218 + &src.val, 4, ctxt, ops)) ) 75.219 goto done; 75.220 dst.val = (int32_t)src.val; 75.221 } 75.222 @@ -1613,8 +1655,8 @@ x86_emulate( 75.223 unsigned long src1; /* ModR/M source operand */ 75.224 if ( ea.type == OP_REG ) 75.225 src1 = *ea.reg; 75.226 - else if ( (rc = ops->read(ea.mem.seg, ea.mem.off, 75.227 - &src1, op_bytes, ctxt)) ) 75.228 + else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, 75.229 + &src1, op_bytes, ctxt, ops)) ) 75.230 goto done; 75.231 _regs.eflags &= ~(EFLG_OF|EFLG_CF); 75.232 switch ( dst.bytes ) 75.233 @@ -1720,8 +1762,8 @@ x86_emulate( 75.234 /* 64-bit mode: POP defaults to a 64-bit operand. 
*/ 75.235 if ( mode_64bit() && (dst.bytes == 4) ) 75.236 dst.bytes = 8; 75.237 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), 75.238 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.239 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), 75.240 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.241 goto done; 75.242 break; 75.243 75.244 @@ -1773,8 +1815,8 @@ x86_emulate( 75.245 dst.val = x86_seg_es; 75.246 les: /* dst.val identifies the segment */ 75.247 generate_exception_if(src.type != OP_MEM, EXC_UD, -1); 75.248 - if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes, 75.249 - &sel, 2, ctxt)) != 0 ) 75.250 + if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes, 75.251 + &sel, 2, ctxt, ops)) != 0 ) 75.252 goto done; 75.253 if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 ) 75.254 goto done; 75.255 @@ -2020,8 +2062,8 @@ x86_emulate( 75.256 dst.bytes = op_bytes = 8; 75.257 if ( dst.type == OP_REG ) 75.258 dst.val = *dst.reg; 75.259 - else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, 75.260 - &dst.val, 8, ctxt)) != 0 ) 75.261 + else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, 75.262 + &dst.val, 8, ctxt, ops)) != 0 ) 75.263 goto done; 75.264 } 75.265 src.val = _regs.eip; 75.266 @@ -2036,8 +2078,8 @@ x86_emulate( 75.267 75.268 generate_exception_if(dst.type != OP_MEM, EXC_UD, -1); 75.269 75.270 - if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes, 75.271 - &sel, 2, ctxt)) ) 75.272 + if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes, 75.273 + &sel, 2, ctxt, ops)) ) 75.274 goto done; 75.275 75.276 if ( (modrm_reg & 7) == 3 ) /* call */ 75.277 @@ -2046,9 +2088,9 @@ x86_emulate( 75.278 fail_if(ops->read_segment == NULL); 75.279 if ( (rc = ops->read_segment(x86_seg_cs, ®, ctxt)) || 75.280 (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.281 - reg.sel, op_bytes, ctxt)) || 75.282 + ®.sel, op_bytes, ctxt)) || 75.283 (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.284 - _regs.eip, op_bytes, ctxt)) ) 75.285 + &_regs.eip, op_bytes, ctxt)) ) 75.286 goto done; 75.287 } 75.288 75.289 @@ -2066,12 +2108,12 @@ x86_emulate( 75.290 dst.bytes = 8; 75.291 if ( dst.type == OP_REG ) 75.292 dst.val = *dst.reg; 75.293 - else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, 75.294 - &dst.val, 8, ctxt)) != 0 ) 75.295 + else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, 75.296 + &dst.val, 8, ctxt, ops)) != 0 ) 75.297 goto done; 75.298 } 75.299 if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), 75.300 - dst.val, dst.bytes, ctxt)) != 0 ) 75.301 + &dst.val, dst.bytes, ctxt)) != 0 ) 75.302 goto done; 75.303 dst.type = OP_NONE; 75.304 break; 75.305 @@ -2106,7 +2148,7 @@ x86_emulate( 75.306 &dst.val, dst.bytes, ctxt); 75.307 else 75.308 rc = ops->write( 75.309 - dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt); 75.310 + dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt); 75.311 if ( rc != 0 ) 75.312 goto done; 75.313 default: 75.314 @@ -2153,7 +2195,7 @@ x86_emulate( 75.315 if ( mode_64bit() && (op_bytes == 4) ) 75.316 op_bytes = 8; 75.317 if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.318 - reg.sel, op_bytes, ctxt)) != 0 ) 75.319 + ®.sel, op_bytes, ctxt)) != 0 ) 75.320 goto done; 75.321 break; 75.322 } 75.323 @@ -2165,8 +2207,8 @@ x86_emulate( 75.324 /* 64-bit mode: POP defaults to a 64-bit operand. 
*/ 75.325 if ( mode_64bit() && (op_bytes == 4) ) 75.326 op_bytes = 8; 75.327 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.328 - &dst.val, op_bytes, ctxt)) != 0 ) 75.329 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.330 + &dst.val, op_bytes, ctxt, ops)) != 0 ) 75.331 goto done; 75.332 if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 ) 75.333 return rc; 75.334 @@ -2275,8 +2317,8 @@ x86_emulate( 75.335 dst.bytes = op_bytes; 75.336 if ( mode_64bit() && (dst.bytes == 4) ) 75.337 dst.bytes = 8; 75.338 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), 75.339 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.340 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), 75.341 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.342 goto done; 75.343 break; 75.344 75.345 @@ -2288,7 +2330,7 @@ x86_emulate( 75.346 generate_exception_if(mode_64bit(), EXC_UD, -1); 75.347 for ( i = 0; i < 8; i++ ) 75.348 if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.349 - regs[i], op_bytes, ctxt)) != 0 ) 75.350 + ®s[i], op_bytes, ctxt)) != 0 ) 75.351 goto done; 75.352 break; 75.353 } 75.354 @@ -2303,8 +2345,8 @@ x86_emulate( 75.355 generate_exception_if(mode_64bit(), EXC_UD, -1); 75.356 for ( i = 0; i < 8; i++ ) 75.357 { 75.358 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.359 - &dst.val, op_bytes, ctxt)) != 0 ) 75.360 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.361 + &dst.val, op_bytes, ctxt, ops)) != 0 ) 75.362 goto done; 75.363 switch ( op_bytes ) 75.364 { 75.365 @@ -2382,8 +2424,8 @@ x86_emulate( 75.366 } 75.367 else 75.368 { 75.369 - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), 75.370 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.371 + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), 75.372 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.373 goto done; 75.374 fail_if(ops->write_io == NULL); 75.375 if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 ) 75.376 @@ -2455,9 +2497,9 @@ x86_emulate( 75.377 75.378 if ( (rc = ops->read_segment(x86_seg_cs, ®, ctxt)) || 75.379 (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.380 - reg.sel, op_bytes, ctxt)) || 75.381 + ®.sel, op_bytes, ctxt)) || 75.382 (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), 75.383 - _regs.eip, op_bytes, ctxt)) ) 75.384 + &_regs.eip, op_bytes, ctxt)) ) 75.385 goto done; 75.386 75.387 if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 ) 75.388 @@ -2483,8 +2525,8 @@ x86_emulate( 75.389 /* 64-bit mode: POP defaults to a 64-bit operand. */ 75.390 if ( mode_64bit() && (op_bytes == 4) ) 75.391 op_bytes = 8; 75.392 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.393 - &dst.val, op_bytes, ctxt)) != 0 ) 75.394 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.395 + &dst.val, op_bytes, ctxt, ops)) != 0 ) 75.396 goto done; 75.397 if ( op_bytes == 2 ) 75.398 dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u); 75.399 @@ -2507,8 +2549,8 @@ x86_emulate( 75.400 dst.type = OP_REG; 75.401 dst.reg = (unsigned long *)&_regs.eax; 75.402 dst.bytes = (d & ByteOp) ? 
1 : op_bytes; 75.403 - if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes), 75.404 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.405 + if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes), 75.406 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.407 goto done; 75.408 break; 75.409 75.410 @@ -2536,8 +2578,8 @@ x86_emulate( 75.411 } 75.412 else 75.413 { 75.414 - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), 75.415 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.416 + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), 75.417 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.418 goto done; 75.419 dst.type = OP_MEM; 75.420 nr_reps = 1; 75.421 @@ -2556,10 +2598,10 @@ x86_emulate( 75.422 unsigned long next_eip = _regs.eip; 75.423 get_rep_prefix(); 75.424 src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; 75.425 - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), 75.426 - &dst.val, dst.bytes, ctxt)) || 75.427 - (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi), 75.428 - &src.val, src.bytes, ctxt)) ) 75.429 + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), 75.430 + &dst.val, dst.bytes, ctxt, ops)) || 75.431 + (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), 75.432 + &src.val, src.bytes, ctxt, ops)) ) 75.433 goto done; 75.434 register_address_increment( 75.435 _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); 75.436 @@ -2592,8 +2634,8 @@ x86_emulate( 75.437 dst.type = OP_REG; 75.438 dst.bytes = (d & ByteOp) ? 1 : op_bytes; 75.439 dst.reg = (unsigned long *)&_regs.eax; 75.440 - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), 75.441 - &dst.val, dst.bytes, ctxt)) != 0 ) 75.442 + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), 75.443 + &dst.val, dst.bytes, ctxt, ops)) != 0 ) 75.444 goto done; 75.445 register_address_increment( 75.446 _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); 75.447 @@ -2606,8 +2648,8 @@ x86_emulate( 75.448 get_rep_prefix(); 75.449 src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; 75.450 dst.val = _regs.eax; 75.451 - if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi), 75.452 - &src.val, src.bytes, ctxt)) != 0 ) 75.453 + if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), 75.454 + &src.val, src.bytes, ctxt, ops)) != 0 ) 75.455 goto done; 75.456 register_address_increment( 75.457 _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes); 75.458 @@ -2624,8 +2666,8 @@ x86_emulate( 75.459 case 0xc3: /* ret (near) */ { 75.460 int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0; 75.461 op_bytes = mode_64bit() ? 8 : op_bytes; 75.462 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset), 75.463 - &dst.val, op_bytes, ctxt)) != 0 ) 75.464 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), 75.465 + &dst.val, op_bytes, ctxt, ops)) != 0 ) 75.466 goto done; 75.467 _regs.eip = dst.val; 75.468 break; 75.469 @@ -2640,7 +2682,7 @@ x86_emulate( 75.470 dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 
8 : op_bytes; 75.471 dst.reg = (unsigned long *)&_regs.ebp; 75.472 if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), 75.473 - _regs.ebp, dst.bytes, ctxt)) ) 75.474 + &_regs.ebp, dst.bytes, ctxt)) ) 75.475 goto done; 75.476 dst.val = _regs.esp; 75.477 75.478 @@ -2650,14 +2692,14 @@ x86_emulate( 75.479 { 75.480 unsigned long ebp, temp_data; 75.481 ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8); 75.482 - if ( (rc = ops->read(x86_seg_ss, ebp, 75.483 - &temp_data, dst.bytes, ctxt)) || 75.484 + if ( (rc = read_ulong(x86_seg_ss, ebp, 75.485 + &temp_data, dst.bytes, ctxt, ops)) || 75.486 (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), 75.487 - temp_data, dst.bytes, ctxt)) ) 75.488 + &temp_data, dst.bytes, ctxt)) ) 75.489 goto done; 75.490 } 75.491 if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), 75.492 - dst.val, dst.bytes, ctxt)) ) 75.493 + &dst.val, dst.bytes, ctxt)) ) 75.494 goto done; 75.495 } 75.496 75.497 @@ -2683,8 +2725,8 @@ x86_emulate( 75.498 75.499 /* Second writeback, to %%ebp. */ 75.500 dst.reg = (unsigned long *)&_regs.ebp; 75.501 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), 75.502 - &dst.val, dst.bytes, ctxt)) ) 75.503 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), 75.504 + &dst.val, dst.bytes, ctxt, ops)) ) 75.505 goto done; 75.506 break; 75.507 75.508 @@ -2692,10 +2734,10 @@ x86_emulate( 75.509 case 0xcb: /* ret (far) */ { 75.510 int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0; 75.511 op_bytes = mode_64bit() ? 8 : op_bytes; 75.512 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.513 - &dst.val, op_bytes, ctxt)) || 75.514 - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset), 75.515 - &src.val, op_bytes, ctxt)) || 75.516 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.517 + &dst.val, op_bytes, ctxt, ops)) || 75.518 + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), 75.519 + &src.val, op_bytes, ctxt, ops)) || 75.520 (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) ) 75.521 goto done; 75.522 _regs.eip = dst.val; 75.523 @@ -2729,12 +2771,12 @@ x86_emulate( 75.524 if ( !mode_iopl() ) 75.525 mask |= EFLG_IF; 75.526 fail_if(!in_realmode(ctxt, ops)); 75.527 - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.528 - &eip, op_bytes, ctxt)) || 75.529 - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.530 - &cs, op_bytes, ctxt)) || 75.531 - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), 75.532 - &eflags, op_bytes, ctxt)) ) 75.533 + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.534 + &eip, op_bytes, ctxt, ops)) || 75.535 + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.536 + &cs, op_bytes, ctxt, ops)) || 75.537 + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), 75.538 + &eflags, op_bytes, ctxt, ops)) ) 75.539 goto done; 75.540 if ( op_bytes == 2 ) 75.541 eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u); 75.542 @@ -2779,13 +2821,65 @@ x86_emulate( 75.543 75.544 case 0xd7: /* xlat */ { 75.545 unsigned long al = (uint8_t)_regs.eax; 75.546 - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al), 75.547 - &al, 1, ctxt)) != 0 ) 75.548 + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al), 75.549 + &al, 1, ctxt, ops)) != 0 ) 75.550 goto done; 75.551 *(uint8_t *)&_regs.eax = al; 75.552 break; 75.553 } 75.554 75.555 + case 0xd8: /* FPU 0xd8 */ 75.556 + switch ( modrm ) 75.557 + { 75.558 + case 0xc0 ... 0xc7: /* fadd %stN,%stN */ 75.559 + case 0xc8 ... 0xcf: /* fmul %stN,%stN */ 75.560 + case 0xd0 ... 
0xd7: /* fcom %stN,%stN */ 75.561 + case 0xd8 ... 0xdf: /* fcomp %stN,%stN */ 75.562 + case 0xe0 ... 0xe7: /* fsub %stN,%stN */ 75.563 + case 0xe8 ... 0xef: /* fsubr %stN,%stN */ 75.564 + case 0xf0 ... 0xf7: /* fdiv %stN,%stN */ 75.565 + case 0xf8 ... 0xff: /* fdivr %stN,%stN */ 75.566 + emulate_fpu_insn_stub(0xd8, modrm); 75.567 + break; 75.568 + default: 75.569 + fail_if(modrm >= 0xc0); 75.570 + ea.bytes = 4; 75.571 + src = ea; 75.572 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.573 + src.bytes, ctxt)) != 0 ) 75.574 + goto done; 75.575 + switch ( modrm_reg & 7 ) 75.576 + { 75.577 + case 0: /* fadd */ 75.578 + emulate_fpu_insn_memsrc("fadds", src.val); 75.579 + break; 75.580 + case 1: /* fmul */ 75.581 + emulate_fpu_insn_memsrc("fmuls", src.val); 75.582 + break; 75.583 + case 2: /* fcom */ 75.584 + emulate_fpu_insn_memsrc("fcoms", src.val); 75.585 + break; 75.586 + case 3: /* fcomp */ 75.587 + emulate_fpu_insn_memsrc("fcomps", src.val); 75.588 + break; 75.589 + case 4: /* fsub */ 75.590 + emulate_fpu_insn_memsrc("fsubs", src.val); 75.591 + break; 75.592 + case 5: /* fsubr */ 75.593 + emulate_fpu_insn_memsrc("fsubrs", src.val); 75.594 + break; 75.595 + case 6: /* fdiv */ 75.596 + emulate_fpu_insn_memsrc("fdivs", src.val); 75.597 + break; 75.598 + case 7: /* fdivr */ 75.599 + emulate_fpu_insn_memsrc("fdivrs", src.val); 75.600 + break; 75.601 + default: 75.602 + goto cannot_emulate; 75.603 + } 75.604 + } 75.605 + break; 75.606 + 75.607 case 0xd9: /* FPU 0xd9 */ 75.608 switch ( modrm ) 75.609 { 75.610 @@ -2822,28 +2916,269 @@ x86_emulate( 75.611 emulate_fpu_insn_stub(0xd9, modrm); 75.612 break; 75.613 default: 75.614 - fail_if((modrm_reg & 7) != 7); 75.615 fail_if(modrm >= 0xc0); 75.616 - /* fnstcw m2byte */ 75.617 - ea.bytes = 2; 75.618 - dst = ea; 75.619 - emulate_fpu_insn_memdst("fnstcw", dst.val); 75.620 + switch ( modrm_reg & 7 ) 75.621 + { 75.622 + case 0: /* fld m32fp */ 75.623 + ea.bytes = 4; 75.624 + src = ea; 75.625 + if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val, 75.626 + src.bytes, ctxt)) != 0 ) 75.627 + goto done; 75.628 + emulate_fpu_insn_memsrc("flds", src.val); 75.629 + break; 75.630 + case 2: /* fstp m32fp */ 75.631 + ea.bytes = 4; 75.632 + dst = ea; 75.633 + dst.type = OP_MEM; 75.634 + emulate_fpu_insn_memdst("fsts", dst.val); 75.635 + break; 75.636 + case 3: /* fstp m32fp */ 75.637 + ea.bytes = 4; 75.638 + dst = ea; 75.639 + dst.type = OP_MEM; 75.640 + emulate_fpu_insn_memdst("fstps", dst.val); 75.641 + break; 75.642 + /* case 4: fldenv - TODO */ 75.643 + case 5: /* fldcw m2byte */ 75.644 + ea.bytes = 2; 75.645 + src = ea; 75.646 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.647 + src.bytes, ctxt)) != 0 ) 75.648 + goto done; 75.649 + emulate_fpu_insn_memsrc("fldcw", src.val); 75.650 + break; 75.651 + /* case 6: fstenv - TODO */ 75.652 + case 7: /* fnstcw m2byte */ 75.653 + ea.bytes = 2; 75.654 + dst = ea; 75.655 + dst.type = OP_MEM; 75.656 + emulate_fpu_insn_memdst("fnstcw", dst.val); 75.657 + break; 75.658 + default: 75.659 + goto cannot_emulate; 75.660 + } 75.661 + } 75.662 + break; 75.663 + 75.664 + case 0xda: /* FPU 0xda */ 75.665 + switch ( modrm ) 75.666 + { 75.667 + case 0xc0 ... 0xc7: /* fcmovb %stN */ 75.668 + case 0xc8 ... 0xcf: /* fcmove %stN */ 75.669 + case 0xd0 ... 0xd7: /* fcmovbe %stN */ 75.670 + case 0xd8 ... 
0xdf: /* fcmovu %stN */ 75.671 + case 0xe9: /* fucompp */ 75.672 + emulate_fpu_insn_stub(0xda, modrm); 75.673 + break; 75.674 + default: 75.675 + fail_if(modrm >= 0xc0); 75.676 + ea.bytes = 8; 75.677 + src = ea; 75.678 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.679 + src.bytes, ctxt)) != 0 ) 75.680 + goto done; 75.681 + switch ( modrm_reg & 7 ) 75.682 + { 75.683 + case 0: /* fiadd m64i */ 75.684 + emulate_fpu_insn_memsrc("fiaddl", src.val); 75.685 + break; 75.686 + case 1: /* fimul m64i */ 75.687 + emulate_fpu_insn_memsrc("fimul", src.val); 75.688 + break; 75.689 + case 2: /* ficom m64i */ 75.690 + emulate_fpu_insn_memsrc("ficoml", src.val); 75.691 + break; 75.692 + case 3: /* ficomp m64i */ 75.693 + emulate_fpu_insn_memsrc("ficompl", src.val); 75.694 + break; 75.695 + case 4: /* fisub m64i */ 75.696 + emulate_fpu_insn_memsrc("fisubl", src.val); 75.697 + break; 75.698 + case 5: /* fisubr m64i */ 75.699 + emulate_fpu_insn_memsrc("fisubrl", src.val); 75.700 + break; 75.701 + case 6: /* fidiv m64i */ 75.702 + emulate_fpu_insn_memsrc("fidivl", src.val); 75.703 + break; 75.704 + case 7: /* fidivr m64i */ 75.705 + emulate_fpu_insn_memsrc("fidivrl", src.val); 75.706 + break; 75.707 + default: 75.708 + goto cannot_emulate; 75.709 + } 75.710 } 75.711 break; 75.712 75.713 case 0xdb: /* FPU 0xdb */ 75.714 - fail_if(modrm != 0xe3); 75.715 - /* fninit */ 75.716 - emulate_fpu_insn("fninit"); 75.717 + switch ( modrm ) 75.718 + { 75.719 + case 0xc0 ... 0xc7: /* fcmovnb %stN */ 75.720 + case 0xc8 ... 0xcf: /* fcmovne %stN */ 75.721 + case 0xd0 ... 0xd7: /* fcmovnbe %stN */ 75.722 + case 0xd8 ... 0xdf: /* fcmovnu %stN */ 75.723 + emulate_fpu_insn_stub(0xdb, modrm); 75.724 + break; 75.725 + case 0xe2: /* fnclex */ 75.726 + emulate_fpu_insn("fnclex"); 75.727 + break; 75.728 + case 0xe3: /* fninit */ 75.729 + emulate_fpu_insn("fninit"); 75.730 + break; 75.731 + case 0xe4: /* fsetpm - 287 only, ignored by 387 */ 75.732 + break; 75.733 + case 0xe8 ... 0xef: /* fucomi %stN */ 75.734 + case 0xf0 ... 
0xf7: /* fcomi %stN */ 75.735 + emulate_fpu_insn_stub(0xdb, modrm); 75.736 + break; 75.737 + default: 75.738 + fail_if(modrm >= 0xc0); 75.739 + switch ( modrm_reg & 7 ) 75.740 + { 75.741 + case 0: /* fild m32i */ 75.742 + ea.bytes = 4; 75.743 + src = ea; 75.744 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.745 + src.bytes, ctxt)) != 0 ) 75.746 + goto done; 75.747 + emulate_fpu_insn_memsrc("fildl", src.val); 75.748 + break; 75.749 + case 1: /* fisttp m32i */ 75.750 + ea.bytes = 4; 75.751 + dst = ea; 75.752 + dst.type = OP_MEM; 75.753 + emulate_fpu_insn_memdst("fisttpl", dst.val); 75.754 + break; 75.755 + case 2: /* fist m32i */ 75.756 + ea.bytes = 4; 75.757 + dst = ea; 75.758 + dst.type = OP_MEM; 75.759 + emulate_fpu_insn_memdst("fistl", dst.val); 75.760 + break; 75.761 + case 3: /* fistp m32i */ 75.762 + ea.bytes = 4; 75.763 + dst = ea; 75.764 + dst.type = OP_MEM; 75.765 + emulate_fpu_insn_memdst("fistpl", dst.val); 75.766 + break; 75.767 + case 5: /* fld m80fp */ 75.768 + ea.bytes = 10; 75.769 + src = ea; 75.770 + if ( (rc = ops->read(src.mem.seg, src.mem.off, 75.771 + &src.val, src.bytes, ctxt)) != 0 ) 75.772 + goto done; 75.773 + emulate_fpu_insn_memdst("fldt", src.val); 75.774 + break; 75.775 + case 7: /* fstp m80fp */ 75.776 + ea.bytes = 10; 75.777 + dst.type = OP_MEM; 75.778 + dst = ea; 75.779 + emulate_fpu_insn_memdst("fstpt", dst.val); 75.780 + break; 75.781 + default: 75.782 + goto cannot_emulate; 75.783 + } 75.784 + } 75.785 + break; 75.786 + 75.787 + case 0xdc: /* FPU 0xdc */ 75.788 + switch ( modrm ) 75.789 + { 75.790 + case 0xc0 ... 0xc7: /* fadd %stN */ 75.791 + case 0xc8 ... 0xcf: /* fmul %stN */ 75.792 + case 0xe0 ... 0xe7: /* fsubr %stN */ 75.793 + case 0xe8 ... 0xef: /* fsub %stN */ 75.794 + case 0xf0 ... 0xf7: /* fdivr %stN */ 75.795 + case 0xf8 ... 0xff: /* fdiv %stN */ 75.796 + emulate_fpu_insn_stub(0xdc, modrm); 75.797 + break; 75.798 + default: 75.799 + fail_if(modrm >= 0xc0); 75.800 + ea.bytes = 8; 75.801 + src = ea; 75.802 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.803 + src.bytes, ctxt)) != 0 ) 75.804 + goto done; 75.805 + switch ( modrm_reg & 7 ) 75.806 + { 75.807 + case 0: /* fadd m64fp */ 75.808 + emulate_fpu_insn_memsrc("faddl", src.val); 75.809 + break; 75.810 + case 1: /* fmul m64fp */ 75.811 + emulate_fpu_insn_memsrc("fmull", src.val); 75.812 + break; 75.813 + case 2: /* fcom m64fp */ 75.814 + emulate_fpu_insn_memsrc("fcoml", src.val); 75.815 + break; 75.816 + case 3: /* fcomp m64fp */ 75.817 + emulate_fpu_insn_memsrc("fcompl", src.val); 75.818 + break; 75.819 + case 4: /* fsub m64fp */ 75.820 + emulate_fpu_insn_memsrc("fsubl", src.val); 75.821 + break; 75.822 + case 5: /* fsubr m64fp */ 75.823 + emulate_fpu_insn_memsrc("fsubrl", src.val); 75.824 + break; 75.825 + case 6: /* fdiv m64fp */ 75.826 + emulate_fpu_insn_memsrc("fdivl", src.val); 75.827 + break; 75.828 + case 7: /* fdivr m64fp */ 75.829 + emulate_fpu_insn_memsrc("fdivrl", src.val); 75.830 + break; 75.831 + } 75.832 + } 75.833 break; 75.834 75.835 case 0xdd: /* FPU 0xdd */ 75.836 - fail_if((modrm_reg & 7) != 7); 75.837 - fail_if(modrm >= 0xc0); 75.838 - /* fnstsw m2byte */ 75.839 - ea.bytes = 2; 75.840 - dst = ea; 75.841 - emulate_fpu_insn_memdst("fnstsw", dst.val); 75.842 + switch ( modrm ) 75.843 + { 75.844 + case 0xc0 ... 0xc7: /* ffree %stN */ 75.845 + case 0xd0 ... 0xd7: /* fst %stN */ 75.846 + case 0xd8 ... 0xdf: /* fstp %stN */ 75.847 + case 0xe0 ... 0xe7: /* fucom %stN */ 75.848 + case 0xe8 ... 
0xef: /* fucomp %stN */ 75.849 + emulate_fpu_insn_stub(0xdd, modrm); 75.850 + break; 75.851 + default: 75.852 + fail_if(modrm >= 0xc0); 75.853 + switch ( modrm_reg & 7 ) 75.854 + { 75.855 + case 0: /* fld m64fp */; 75.856 + ea.bytes = 8; 75.857 + src = ea; 75.858 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.859 + src.bytes, ctxt)) != 0 ) 75.860 + goto done; 75.861 + emulate_fpu_insn_memsrc("fldl", src.val); 75.862 + break; 75.863 + case 1: /* fisttp m64i */ 75.864 + ea.bytes = 8; 75.865 + dst = ea; 75.866 + dst.type = OP_MEM; 75.867 + emulate_fpu_insn_memdst("fisttpll", dst.val); 75.868 + break; 75.869 + case 2: /* fst m64fp */ 75.870 + ea.bytes = 8; 75.871 + dst = ea; 75.872 + dst.type = OP_MEM; 75.873 + emulate_fpu_insn_memsrc("fstl", dst.val); 75.874 + break; 75.875 + case 3: /* fstp m64fp */ 75.876 + ea.bytes = 8; 75.877 + dst = ea; 75.878 + dst.type = OP_MEM; 75.879 + emulate_fpu_insn_memdst("fstpl", dst.val); 75.880 + break; 75.881 + case 7: /* fnstsw m2byte */ 75.882 + ea.bytes = 2; 75.883 + dst = ea; 75.884 + dst.type = OP_MEM; 75.885 + emulate_fpu_insn_memdst("fnstsw", dst.val); 75.886 + break; 75.887 + default: 75.888 + goto cannot_emulate; 75.889 + } 75.890 + } 75.891 break; 75.892 75.893 case 0xde: /* FPU 0xde */ 75.894 @@ -2859,17 +3194,120 @@ x86_emulate( 75.895 emulate_fpu_insn_stub(0xde, modrm); 75.896 break; 75.897 default: 75.898 - goto cannot_emulate; 75.899 + fail_if(modrm >= 0xc0); 75.900 + ea.bytes = 2; 75.901 + src = ea; 75.902 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.903 + src.bytes, ctxt)) != 0 ) 75.904 + goto done; 75.905 + switch ( modrm_reg & 7 ) 75.906 + { 75.907 + case 0: /* fiadd m16i */ 75.908 + emulate_fpu_insn_memsrc("fiadd", src.val); 75.909 + break; 75.910 + case 1: /* fimul m16i */ 75.911 + emulate_fpu_insn_memsrc("fimul", src.val); 75.912 + break; 75.913 + case 2: /* ficom m16i */ 75.914 + emulate_fpu_insn_memsrc("ficom", src.val); 75.915 + break; 75.916 + case 3: /* ficomp m16i */ 75.917 + emulate_fpu_insn_memsrc("ficomp", src.val); 75.918 + break; 75.919 + case 4: /* fisub m16i */ 75.920 + emulate_fpu_insn_memsrc("fisub", src.val); 75.921 + break; 75.922 + case 5: /* fisubr m16i */ 75.923 + emulate_fpu_insn_memsrc("fisubr", src.val); 75.924 + break; 75.925 + case 6: /* fidiv m16i */ 75.926 + emulate_fpu_insn_memsrc("fidiv", src.val); 75.927 + break; 75.928 + case 7: /* fidivr m16i */ 75.929 + emulate_fpu_insn_memsrc("fidivr", src.val); 75.930 + break; 75.931 + default: 75.932 + goto cannot_emulate; 75.933 + } 75.934 } 75.935 break; 75.936 75.937 case 0xdf: /* FPU 0xdf */ 75.938 - fail_if(modrm != 0xe0); 75.939 - /* fnstsw %ax */ 75.940 - dst.bytes = 2; 75.941 - dst.type = OP_REG; 75.942 - dst.reg = (unsigned long *)&_regs.eax; 75.943 - emulate_fpu_insn_memdst("fnstsw", dst.val); 75.944 + switch ( modrm ) 75.945 + { 75.946 + case 0xe0: 75.947 + /* fnstsw %ax */ 75.948 + dst.bytes = 2; 75.949 + dst.type = OP_REG; 75.950 + dst.reg = (unsigned long *)&_regs.eax; 75.951 + emulate_fpu_insn_memdst("fnstsw", dst.val); 75.952 + break; 75.953 + case 0xf0 ... 0xf7: /* fcomip %stN */ 75.954 + case 0xf8 ... 
0xff: /* fucomip %stN */ 75.955 + emulate_fpu_insn_stub(0xdf, modrm); 75.956 + break; 75.957 + default: 75.958 + fail_if(modrm >= 0xc0); 75.959 + switch ( modrm_reg & 7 ) 75.960 + { 75.961 + case 0: /* fild m16i */ 75.962 + ea.bytes = 2; 75.963 + src = ea; 75.964 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.965 + src.bytes, ctxt)) != 0 ) 75.966 + goto done; 75.967 + emulate_fpu_insn_memsrc("fild", src.val); 75.968 + break; 75.969 + case 1: /* fisttp m16i */ 75.970 + ea.bytes = 2; 75.971 + dst = ea; 75.972 + dst.type = OP_MEM; 75.973 + emulate_fpu_insn_memdst("fisttp", dst.val); 75.974 + break; 75.975 + case 2: /* fist m16i */ 75.976 + ea.bytes = 2; 75.977 + dst = ea; 75.978 + dst.type = OP_MEM; 75.979 + emulate_fpu_insn_memdst("fist", dst.val); 75.980 + break; 75.981 + case 3: /* fistp m16i */ 75.982 + ea.bytes = 2; 75.983 + dst = ea; 75.984 + dst.type = OP_MEM; 75.985 + emulate_fpu_insn_memdst("fistp", dst.val); 75.986 + break; 75.987 + case 4: /* fbld m80dec */ 75.988 + ea.bytes = 10; 75.989 + dst = ea; 75.990 + if ( (rc = ops->read(src.mem.seg, src.mem.off, 75.991 + &src.val, src.bytes, ctxt)) != 0 ) 75.992 + goto done; 75.993 + emulate_fpu_insn_memdst("fbld", src.val); 75.994 + break; 75.995 + case 5: /* fild m64i */ 75.996 + ea.bytes = 8; 75.997 + src = ea; 75.998 + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, 75.999 + src.bytes, ctxt)) != 0 ) 75.1000 + goto done; 75.1001 + emulate_fpu_insn_memsrc("fildll", src.val); 75.1002 + break; 75.1003 + case 6: /* fbstp packed bcd */ 75.1004 + ea.bytes = 10; 75.1005 + dst = ea; 75.1006 + dst.type = OP_MEM; 75.1007 + emulate_fpu_insn_memdst("fbstp", dst.val); 75.1008 + break; 75.1009 + case 7: /* fistp m64i */ 75.1010 + ea.bytes = 8; 75.1011 + dst = ea; 75.1012 + dst.type = OP_MEM; 75.1013 + emulate_fpu_insn_memdst("fistpll", dst.val); 75.1014 + break; 75.1015 + default: 75.1016 + goto cannot_emulate; 75.1017 + } 75.1018 + } 75.1019 break; 75.1020 75.1021 case 0xe0 ... 0xe2: /* loop{,z,nz} */ { 75.1022 @@ -2924,7 +3362,6 @@ x86_emulate( 75.1023 /* out */ 75.1024 fail_if(ops->write_io == NULL); 75.1025 rc = ops->write_io(port, op_bytes, _regs.eax, ctxt); 75.1026 - 75.1027 } 75.1028 else 75.1029 { 75.1030 @@ -3242,9 +3679,9 @@ x86_emulate( 75.1031 if ( op_bytes == 2 ) 75.1032 reg.base &= 0xffffff; 75.1033 if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, 75.1034 - reg.limit, 2, ctxt)) || 75.1035 + &reg.limit, 2, ctxt)) || 75.1036 (rc = ops->write(ea.mem.seg, ea.mem.off+2, 75.1037 - reg.base, mode_64bit() ? 8 : 4, ctxt)) ) 75.1038 + &reg.base, mode_64bit() ? 8 : 4, ctxt)) ) 75.1039 goto done; 75.1040 break; 75.1041 case 2: /* lgdt */ 75.1042 @@ -3252,10 +3689,10 @@ x86_emulate( 75.1043 generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); 75.1044 fail_if(ops->write_segment == NULL); 75.1045 memset(&reg, 0, sizeof(reg)); 75.1046 - if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, 75.1047 - &limit, 2, ctxt)) || 75.1048 - (rc = ops->read(ea.mem.seg, ea.mem.off+2, 75.1049 - &base, mode_64bit() ? 8 : 4, ctxt)) ) 75.1050 + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0, 75.1051 + &limit, 2, ctxt, ops)) || 75.1052 + (rc = read_ulong(ea.mem.seg, ea.mem.off+2, 75.1053 + &base, mode_64bit() ?
8 : 4, ctxt, ops)) ) 75.1054 goto done; 75.1055 reg.base = base; 75.1056 reg.limit = limit; 75.1057 @@ -3267,7 +3704,8 @@ x86_emulate( 75.1058 goto done; 75.1059 break; 75.1060 case 4: /* smsw */ 75.1061 - ea.bytes = 2; 75.1062 + if ( ea.type == OP_MEM ) 75.1063 + ea.bytes = 2; 75.1064 dst = ea; 75.1065 fail_if(ops->read_cr == NULL); 75.1066 if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ) 75.1067 @@ -3281,11 +3719,11 @@ x86_emulate( 75.1068 goto done; 75.1069 if ( ea.type == OP_REG ) 75.1070 cr0w = *ea.reg; 75.1071 - else if ( (rc = ops->read(ea.mem.seg, ea.mem.off, 75.1072 - &cr0w, 2, ctxt)) ) 75.1073 + else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, 75.1074 + &cr0w, 2, ctxt, ops)) ) 75.1075 goto done; 75.1076 - cr0 &= 0xffff0000; 75.1077 - cr0 |= (uint16_t)cr0w; 75.1078 + /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */ 75.1079 + cr0 = (cr0 & ~0xe) | (cr0w & 0xf); 75.1080 if ( (rc = ops->write_cr(0, cr0, ctxt)) ) 75.1081 goto done; 75.1082 break; 75.1083 @@ -3404,8 +3842,10 @@ x86_emulate( 75.1084 if ( ea.type == OP_MEM ) 75.1085 { 75.1086 unsigned long lval, hval; 75.1087 - if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) || 75.1088 - (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) ) 75.1089 + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0, 75.1090 + &lval, 4, ctxt, ops)) || 75.1091 + (rc = read_ulong(ea.mem.seg, ea.mem.off+4, 75.1092 + &hval, 4, ctxt, ops)) ) 75.1093 goto done; 75.1094 val = ((uint64_t)hval << 32) | (uint32_t)lval; 75.1095 stub[2] = modrm & 0x38; /* movq (%eax),%mmN */ 75.1096 @@ -3428,8 +3868,8 @@ x86_emulate( 75.1097 if ( ea.type == OP_MEM ) 75.1098 { 75.1099 unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32); 75.1100 - if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) || 75.1101 - (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) ) 75.1102 + if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) || 75.1103 + (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) ) 75.1104 goto done; 75.1105 } 75.1106 break; 75.1107 @@ -3481,8 +3921,8 @@ x86_emulate( 75.1108 75.1109 /* Get actual old value. */ 75.1110 for ( i = 0; i < (op_bytes/sizeof(long)); i++ ) 75.1111 - if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long), 75.1112 - &old[i], sizeof(long), ctxt)) != 0 ) 75.1113 + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long), 75.1114 + &old[i], sizeof(long), ctxt, ops)) != 0 ) 75.1115 goto done; 75.1116 75.1117 /* Get expected and proposed values. */
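The x86_emulate.c hunks above convert most direct ops->read() calls to a read_ulong() helper, needed because ops->read() now fills a caller-supplied byte buffer rather than an unsigned long (see the header change below). The changeset only shows read_ulong()'s call signature; a minimal sketch of what such a wrapper presumably does, assuming it simply zero-extends the value it reads:

static int read_ulong(
    enum x86_segment seg,
    unsigned long offset,
    unsigned long *val,
    unsigned int bytes,
    struct x86_emulate_ctxt *ctxt,
    struct x86_emulate_ops *ops)
{
    *val = 0;  /* zero first: ops->read() only writes the low @bytes bytes */
    return ops->read(seg, offset, val, bytes, ctxt);
}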
76.1 --- a/xen/arch/x86/x86_emulate/x86_emulate.h Thu Jun 19 12:48:04 2008 +0900 76.2 +++ b/xen/arch/x86/x86_emulate/x86_emulate.h Wed Jul 02 11:30:37 2008 +0900 76.3 @@ -102,7 +102,8 @@ enum x86_emulate_fpu_type { 76.4 }; 76.5 76.6 /* 76.7 - * These operations represent the instruction emulator's interface to memory. 76.8 + * These operations represent the instruction emulator's interface to memory, 76.9 + * I/O ports, privileged state... pretty much everything other than GPRs. 76.10 * 76.11 * NOTES: 76.12 * 1. If the access fails (cannot emulate, or a standard access faults) then 76.13 @@ -110,8 +111,7 @@ enum x86_emulate_fpu_type { 76.14 * some out-of-band mechanism, unknown to the emulator. The memop signals 76.15 * failure by returning X86EMUL_EXCEPTION to the emulator, which will 76.16 * then immediately bail. 76.17 - * 2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes. 76.18 - * 3. The emulator cannot handle 64-bit mode emulation on an x86/32 system. 76.19 + * 2. The emulator cannot handle 64-bit mode emulation on an x86/32 system. 76.20 */ 76.21 struct x86_emulate_ops 76.22 { 76.23 @@ -121,19 +121,25 @@ struct x86_emulate_ops 76.24 * All memory-access functions: 76.25 * @seg: [IN ] Segment being dereferenced (specified as x86_seg_??). 76.26 * @offset:[IN ] Offset within segment. 76.27 + * @p_data:[IN ] Pointer to i/o data buffer (length is @bytes) 76.28 * Read functions: 76.29 * @val: [OUT] Value read, zero-extended to 'ulong'. 76.30 * Write functions: 76.31 * @val: [IN ] Value to write (low-order bytes used as req'd). 76.32 * Variable-length access functions: 76.33 - * @bytes: [IN ] Number of bytes to read or write. 76.34 + * @bytes: [IN ] Number of bytes to read or write. Valid access sizes are 76.35 + * 1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise 76.36 + * stated. 76.37 */ 76.38 76.39 - /* read: Emulate a memory read. */ 76.40 + /* 76.41 + * read: Emulate a memory read. 76.42 + * @bytes: Access length (0 < @bytes < 4096). 76.43 + */ 76.44 int (*read)( 76.45 enum x86_segment seg, 76.46 unsigned long offset, 76.47 - unsigned long *val, 76.48 + void *p_data, 76.49 unsigned int bytes, 76.50 struct x86_emulate_ctxt *ctxt); 76.51 76.52 @@ -144,15 +150,18 @@ struct x86_emulate_ops 76.53 int (*insn_fetch)( 76.54 enum x86_segment seg, 76.55 unsigned long offset, 76.56 - unsigned long *val, 76.57 + void *p_data, 76.58 unsigned int bytes, 76.59 struct x86_emulate_ctxt *ctxt); 76.60 76.61 - /* write: Emulate a memory write. */ 76.62 + /* 76.63 + * write: Emulate a memory write. 76.64 + * @bytes: Access length (0 < @bytes < 4096). 76.65 + */ 76.66 int (*write)( 76.67 enum x86_segment seg, 76.68 unsigned long offset, 76.69 - unsigned long val, 76.70 + void *p_data, 76.71 unsigned int bytes, 76.72 struct x86_emulate_ctxt *ctxt); 76.73
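With the interface change above, the read, insn_fetch and write callbacks all exchange data through a void *p_data buffer of length @bytes, and writes now pass a pointer rather than a value. Purely as an illustration of the new prototypes (sketch_read, sketch_write and the flat emul_buf backing store are invented names, and the X86EMUL_* return codes are assumed from the same header), a trivial memcpy-based backend could look like:

#include <string.h>   /* memcpy; x86_emulate.h is assumed to be included as well */

static unsigned char emul_buf[4096];   /* invented flat backing store */

static int sketch_read(enum x86_segment seg, unsigned long offset,
                       void *p_data, unsigned int bytes,
                       struct x86_emulate_ctxt *ctxt)
{
    if ( (offset + bytes) > sizeof(emul_buf) )
        return X86EMUL_UNHANDLEABLE;
    memcpy(p_data, &emul_buf[offset], bytes);    /* fill the caller's buffer */
    return X86EMUL_OKAY;
}

static int sketch_write(enum x86_segment seg, unsigned long offset,
                        void *p_data, unsigned int bytes,
                        struct x86_emulate_ctxt *ctxt)
{
    if ( (offset + bytes) > sizeof(emul_buf) )
        return X86EMUL_UNHANDLEABLE;
    memcpy(&emul_buf[offset], p_data, bytes);    /* consume the caller's buffer */
    return X86EMUL_OKAY;
}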
77.1 --- a/xen/common/domain.c Thu Jun 19 12:48:04 2008 +0900 77.2 +++ b/xen/common/domain.c Wed Jul 02 11:30:37 2008 +0900 77.3 @@ -73,36 +73,13 @@ int current_domain_id(void) 77.4 return current->domain->domain_id; 77.5 } 77.6 77.7 -struct domain *alloc_domain(domid_t domid) 77.8 +static struct domain *alloc_domain_struct(void) 77.9 { 77.10 - struct domain *d; 77.11 - 77.12 - if ( (d = xmalloc(struct domain)) == NULL ) 77.13 - return NULL; 77.14 - 77.15 - memset(d, 0, sizeof(*d)); 77.16 - d->domain_id = domid; 77.17 - 77.18 - if ( xsm_alloc_security_domain(d) != 0 ) 77.19 - { 77.20 - free_domain(d); 77.21 - return NULL; 77.22 - } 77.23 - 77.24 - atomic_set(&d->refcnt, 1); 77.25 - spin_lock_init(&d->domain_lock); 77.26 - spin_lock_init(&d->page_alloc_lock); 77.27 - spin_lock_init(&d->shutdown_lock); 77.28 - spin_lock_init(&d->hypercall_deadlock_mutex); 77.29 - INIT_LIST_HEAD(&d->page_list); 77.30 - INIT_LIST_HEAD(&d->xenpage_list); 77.31 - 77.32 - return d; 77.33 + return xmalloc(struct domain); 77.34 } 77.35 77.36 -void free_domain(struct domain *d) 77.37 +static void free_domain_struct(struct domain *d) 77.38 { 77.39 - xsm_free_security_domain(d); 77.40 xfree(d); 77.41 } 77.42 77.43 @@ -210,19 +187,39 @@ struct domain *domain_create( 77.44 domid_t domid, unsigned int domcr_flags, ssidref_t ssidref) 77.45 { 77.46 struct domain *d, **pd; 77.47 - enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 }; 77.48 + enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2, 77.49 + INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 }; 77.50 int init_status = 0; 77.51 77.52 - if ( (d = alloc_domain(domid)) == NULL ) 77.53 + if ( (d = alloc_domain_struct()) == NULL ) 77.54 return NULL; 77.55 77.56 + memset(d, 0, sizeof(*d)); 77.57 + d->domain_id = domid; 77.58 + 77.59 + if ( xsm_alloc_security_domain(d) != 0 ) 77.60 + goto fail; 77.61 + init_status |= INIT_xsm; 77.62 + 77.63 + atomic_set(&d->refcnt, 1); 77.64 + spin_lock_init(&d->domain_lock); 77.65 + spin_lock_init(&d->page_alloc_lock); 77.66 + spin_lock_init(&d->shutdown_lock); 77.67 + spin_lock_init(&d->hypercall_deadlock_mutex); 77.68 + INIT_LIST_HEAD(&d->page_list); 77.69 + INIT_LIST_HEAD(&d->xenpage_list); 77.70 + 77.71 if ( domcr_flags & DOMCRF_hvm ) 77.72 d->is_hvm = 1; 77.73 77.74 if ( (domid == 0) && opt_dom0_vcpus_pin ) 77.75 d->is_pinned = 1; 77.76 77.77 + if ( domcr_flags & DOMCRF_dummy ) 77.78 + return d; 77.79 + 77.80 rangeset_domain_initialise(d); 77.81 + init_status |= INIT_rangeset; 77.82 77.83 if ( !is_idle_domain(d) ) 77.84 { 77.85 @@ -278,8 +275,11 @@ struct domain *domain_create( 77.86 grant_table_destroy(d); 77.87 if ( init_status & INIT_evtchn ) 77.88 evtchn_destroy(d); 77.89 - rangeset_domain_destroy(d); 77.90 - free_domain(d); 77.91 + if ( init_status & INIT_rangeset ) 77.92 + rangeset_domain_destroy(d); 77.93 + if ( init_status & INIT_xsm ) 77.94 + xsm_free_security_domain(d); 77.95 + free_domain_struct(d); 77.96 return NULL; 77.97 } 77.98 77.99 @@ -535,7 +535,8 @@ static void complete_domain_destroy(stru 77.100 if ( d->target != NULL ) 77.101 put_domain(d->target); 77.102 77.103 - free_domain(d); 77.104 + xsm_free_security_domain(d); 77.105 + free_domain_struct(d); 77.106 77.107 send_guest_global_virq(dom0, VIRQ_DOM_EXC); 77.108 }
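The domain.c rework folds the old alloc_domain()/free_domain() pair into domain_create(), tracks partial initialisation in the widened init_status mask so the failure path only undoes what was actually set up, and adds an early return for DOMCRF_dummy. A hedged usage sketch of the new dummy-domain path (the domid value is a placeholder):

static struct domain *make_dummy_domain(void)
{
    /* With DOMCRF_dummy, domain_create() returns right after the basic
     * struct, XSM and lock initialisation: no rangesets, event channels,
     * grant table or arch state, and the domain is never linked onto the
     * global domain list. */
    return domain_create(0 /* placeholder domid */, DOMCRF_dummy, 0);
}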
78.1 --- a/xen/drivers/passthrough/vtd/dmar.c Thu Jun 19 12:48:04 2008 +0900 78.2 +++ b/xen/drivers/passthrough/vtd/dmar.c Wed Jul 02 11:30:37 2008 +0900 78.3 @@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent 78.4 dmaru->address = drhd->address; 78.5 dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */ 78.6 INIT_LIST_HEAD(&dmaru->ioapic_list); 78.7 - dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address); 78.8 + dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n", 78.9 + dmaru->address); 78.10 78.11 dev_scope_start = (void *)(drhd + 1); 78.12 dev_scope_end = ((void *)drhd) + header->length;
79.1 --- a/xen/drivers/passthrough/vtd/dmar.h Thu Jun 19 12:48:04 2008 +0900 79.2 +++ b/xen/drivers/passthrough/vtd/dmar.h Wed Jul 02 11:30:37 2008 +0900 79.3 @@ -42,28 +42,28 @@ struct acpi_ioapic_unit { 79.4 79.5 struct acpi_drhd_unit { 79.6 struct list_head list; 79.7 - unsigned long address; /* register base address of the unit */ 79.8 - struct pci_dev *devices; /* target devices */ 79.9 + u64 address; /* register base address of the unit */ 79.10 + struct pci_dev *devices; /* target devices */ 79.11 int devices_cnt; 79.12 - u8 include_all:1; 79.13 + u8 include_all:1; 79.14 struct iommu *iommu; 79.15 struct list_head ioapic_list; 79.16 }; 79.17 79.18 struct acpi_rmrr_unit { 79.19 struct list_head list; 79.20 - unsigned long base_address; 79.21 - unsigned long end_address; 79.22 + u64 base_address; 79.23 + u64 end_address; 79.24 struct pci_dev *devices; /* target devices */ 79.25 int devices_cnt; 79.26 - u8 allow_all:1; 79.27 + u8 allow_all:1; 79.28 }; 79.29 79.30 struct acpi_atsr_unit { 79.31 struct list_head list; 79.32 - struct pci_dev *devices; /* target devices */ 79.33 + struct pci_dev *devices; /* target devices */ 79.34 int devices_cnt; 79.35 - u8 all_ports:1; 79.36 + u8 all_ports:1; 79.37 }; 79.38 79.39 #define for_each_iommu(domain, iommu) \
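The dmar.h address fields become u64 because unsigned long is only 32 bits wide on a 32-bit hypervisor, so a DRHD register base or RMRR range above 4GiB would silently truncate; the dmar.c hunk above and the utils.c hunk further down adjust the format strings to match. A small illustration of both points (the value is a placeholder):

static void show_truncation(void)
{
    u64 rmrr_end = 0x100000000ULL;                  /* 4GiB: above a 32-bit long */
    unsigned long lossy = (unsigned long)rmrr_end;  /* truncates to 0 on x86_32 */

    printk("end = %"PRIx64", lossy = %lx\n", rmrr_end, lossy);
}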
80.1 --- a/xen/drivers/passthrough/vtd/intremap.c Thu Jun 19 12:48:04 2008 +0900 80.2 +++ b/xen/drivers/passthrough/vtd/intremap.c Wed Jul 02 11:30:37 2008 +0900 80.3 @@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte( 80.4 unsigned long flags; 80.5 struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); 80.6 80.7 - if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 ) 80.8 + if ( ir_ctrl == NULL ) 80.9 { 80.10 dprintk(XENLOG_ERR VTDPREFIX, 80.11 "remap_entry_to_ioapic_rte: ir_ctl is not ready\n"); 80.12 @@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st 80.13 } 80.14 80.15 memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); 80.16 + iommu_flush_cache_entry(iremap_entry); 80.17 iommu_flush_iec_index(iommu, 0, index); 80.18 invalidate_sync(iommu); 80.19 80.20 @@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte( 80.21 struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid); 80.22 struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); 80.23 80.24 - if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ) 80.25 + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 || 80.26 + ir_ctrl->iremap_index == -1 ) 80.27 { 80.28 *IO_APIC_BASE(apic) = reg; 80.29 return *(IO_APIC_BASE(apic)+4); 80.30 @@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry( 80.31 remap_rte->data = 0; 80.32 80.33 memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); 80.34 + iommu_flush_cache_entry(iremap_entry); 80.35 iommu_flush_iec_index(iommu, 0, index); 80.36 invalidate_sync(iommu); 80.37
81.1 --- a/xen/drivers/passthrough/vtd/iommu.c Thu Jun 19 12:48:04 2008 +0900 81.2 +++ b/xen/drivers/passthrough/vtd/iommu.c Wed Jul 02 11:30:37 2008 +0900 81.3 @@ -1269,7 +1269,6 @@ static int domain_context_mapping( 81.4 } 81.5 81.6 static int domain_context_unmap_one( 81.7 - struct domain *domain, 81.8 struct iommu *iommu, 81.9 u8 bus, u8 devfn) 81.10 { 81.11 @@ -1300,7 +1299,6 @@ static int domain_context_unmap_one( 81.12 } 81.13 81.14 static int domain_context_unmap( 81.15 - struct domain *domain, 81.16 struct iommu *iommu, 81.17 struct pci_dev *pdev) 81.18 { 81.19 @@ -1320,14 +1318,13 @@ static int domain_context_unmap( 81.20 PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS); 81.21 break; 81.22 case DEV_TYPE_PCIe_ENDPOINT: 81.23 - ret = domain_context_unmap_one(domain, iommu, 81.24 + ret = domain_context_unmap_one(iommu, 81.25 (u8)(pdev->bus), (u8)(pdev->devfn)); 81.26 break; 81.27 case DEV_TYPE_PCI: 81.28 if ( pdev->bus == 0 ) 81.29 ret = domain_context_unmap_one( 81.30 - domain, iommu, 81.31 - (u8)(pdev->bus), (u8)(pdev->devfn)); 81.32 + iommu, (u8)(pdev->bus), (u8)(pdev->devfn)); 81.33 else 81.34 { 81.35 if ( bus2bridge[pdev->bus].bus != 0 ) 81.36 @@ -1335,7 +1332,7 @@ static int domain_context_unmap( 81.37 "domain_context_unmap:" 81.38 "bus2bridge[%d].bus != 0\n", pdev->bus); 81.39 81.40 - ret = domain_context_unmap_one(domain, iommu, 81.41 + ret = domain_context_unmap_one(iommu, 81.42 (u8)(bus2bridge[pdev->bus].bus), 81.43 (u8)(bus2bridge[pdev->bus].devfn)); 81.44 81.45 @@ -1345,8 +1342,7 @@ static int domain_context_unmap( 81.46 for ( func = 0; func < 8; func++ ) 81.47 { 81.48 ret = domain_context_unmap_one( 81.49 - domain, iommu, 81.50 - pdev->bus, (u8)PCI_DEVFN(dev, func)); 81.51 + iommu, pdev->bus, (u8)PCI_DEVFN(dev, func)); 81.52 if ( ret ) 81.53 return ret; 81.54 } 81.55 @@ -1389,7 +1385,7 @@ void reassign_device_ownership( 81.56 found: 81.57 drhd = acpi_find_matched_drhd_unit(pdev); 81.58 iommu = drhd->iommu; 81.59 - domain_context_unmap(source, iommu, pdev); 81.60 + domain_context_unmap(iommu, pdev); 81.61 81.62 /* Move pci device from the source domain to target domain. */ 81.63 spin_lock_irqsave(&source_hd->iommu_list_lock, flags); 81.64 @@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev( 81.65 struct pci_dev *pdev) 81.66 { 81.67 struct acpi_drhd_unit *drhd; 81.68 - unsigned long size; 81.69 + u64 size; 81.70 int ret; 81.71 81.72 /* page table init */
82.1 --- a/xen/drivers/passthrough/vtd/qinval.c Thu Jun 19 12:48:04 2008 +0900 82.2 +++ b/xen/drivers/passthrough/vtd/qinval.c Wed Jul 02 11:30:37 2008 +0900 82.3 @@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu) 82.4 int ret = -1; 82.5 struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); 82.6 82.7 - if ( qi_ctrl->qinval_maddr == 0 ) 82.8 + if ( qi_ctrl->qinval_maddr != 0 ) 82.9 { 82.10 ret = queue_invalidate_wait(iommu, 82.11 0, 1, 1, 1, &qi_ctrl->qinval_poll_status); 82.12 @@ -416,7 +416,6 @@ static int flush_iotlb_qi( 82.13 int qinval_setup(struct iommu *iommu) 82.14 { 82.15 s_time_t start_time; 82.16 - u32 status = 0; 82.17 struct qi_ctrl *qi_ctrl; 82.18 struct iommu_flush *flush; 82.19 82.20 @@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu) 82.21 82.22 /* Make sure hardware complete it */ 82.23 start_time = NOW(); 82.24 - for ( ; ; ) 82.25 + while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) ) 82.26 { 82.27 - status = dmar_readl(iommu->reg, DMAR_GSTS_REG); 82.28 - if ( status & DMA_GSTS_QIES ) 82.29 - break; 82.30 if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) 82.31 panic("Cannot set QIE field for queue invalidation\n"); 82.32 cpu_relax(); 82.33 } 82.34 - status = 0; 82.35 - return status; 82.36 + 82.37 + return 0; 82.38 }
83.1 --- a/xen/drivers/passthrough/vtd/utils.c Thu Jun 19 12:48:04 2008 +0900 83.2 +++ b/xen/drivers/passthrough/vtd/utils.c Wed Jul 02 11:30:37 2008 +0900 83.3 @@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u 83.4 struct iommu *iommu = drhd->iommu; 83.5 83.6 printk("---- print_iommu_regs ----\n"); 83.7 - printk("print_iommu_regs: drhd->address = %lx\n", drhd->address); 83.8 + printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address); 83.9 printk("print_iommu_regs: DMAR_VER_REG = %x\n", 83.10 dmar_readl(iommu->reg,DMAR_VER_REG)); 83.11 printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
84.1 --- a/xen/include/acpi/cpufreq/cpufreq.h Thu Jun 19 12:48:04 2008 +0900 84.2 +++ b/xen/include/acpi/cpufreq/cpufreq.h Wed Jul 02 11:30:37 2008 +0900 84.3 @@ -36,7 +36,10 @@ struct cpufreq_policy { 84.4 unsigned int max; /* in kHz */ 84.5 unsigned int cur; /* in kHz, only needed if cpufreq 84.6 * governors are used */ 84.7 + unsigned int resume; /* flag for cpufreq 1st run 84.8 + * S3 wakeup, hotplug cpu, etc */ 84.9 }; 84.10 +extern struct cpufreq_policy xen_px_policy[NR_CPUS]; 84.11 84.12 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */ 84.13 #define CPUFREQ_SHARED_TYPE_HW (1) /* HW does needed coordination */
85.1 --- a/xen/include/acpi/cpufreq/processor_perf.h Thu Jun 19 12:48:04 2008 +0900 85.2 +++ b/xen/include/acpi/cpufreq/processor_perf.h Wed Jul 02 11:30:37 2008 +0900 85.3 @@ -6,9 +6,21 @@ 85.4 85.5 int get_cpu_id(u8); 85.6 int acpi_cpufreq_init(void); 85.7 +int powernow_cpufreq_init(void); 85.8 + 85.9 void px_statistic_update(cpumask_t, uint8_t, uint8_t); 85.10 int px_statistic_init(int); 85.11 void px_statistic_reset(int); 85.12 +void px_statistic_suspend(void); 85.13 +void px_statistic_resume(void); 85.14 + 85.15 +void cpufreq_dom_exit(void); 85.16 +int cpufreq_dom_init(void); 85.17 +int cpufreq_dom_dbs(unsigned int); 85.18 +void cpufreq_suspend(void); 85.19 +int cpufreq_resume(void); 85.20 + 85.21 +inline uint64_t get_cpu_idle_time(unsigned int); 85.22 85.23 struct processor_performance { 85.24 uint32_t state; 85.25 @@ -44,6 +56,7 @@ struct px_stat { 85.26 struct pm_px { 85.27 struct px_stat u; 85.28 uint64_t prev_state_wall; 85.29 + uint64_t prev_idle_wall; 85.30 }; 85.31 85.32 extern struct pm_px px_statistic_data[NR_CPUS];
86.1 --- a/xen/include/asm-x86/domain.h Thu Jun 19 12:48:04 2008 +0900 86.2 +++ b/xen/include/asm-x86/domain.h Wed Jul 02 11:30:37 2008 +0900 86.3 @@ -103,6 +103,9 @@ struct shadow_domain { 86.4 * emulation and remove write permission 86.5 */ 86.6 atomic_t gtable_dirty_version; 86.7 + 86.8 + /* OOS */ 86.9 + int oos_active; 86.10 }; 86.11 86.12 struct shadow_vcpu { 86.13 @@ -122,6 +125,17 @@ struct shadow_vcpu { 86.14 unsigned long last_emulated_frame; 86.15 /* Last MFN that we emulated a write successfully */ 86.16 unsigned long last_emulated_mfn; 86.17 + 86.18 + /* Shadow out-of-sync: pages that this vcpu has let go out of sync */ 86.19 + mfn_t oos[SHADOW_OOS_PAGES]; 86.20 + unsigned long oos_va[SHADOW_OOS_PAGES]; 86.21 + mfn_t oos_snapshot[SHADOW_OOS_PAGES]; 86.22 + struct oos_fixup { 86.23 + mfn_t gmfn; 86.24 + mfn_t smfn; 86.25 + unsigned long off; 86.26 + } *oos_fixups; 86.27 + int oos_fixup_used; 86.28 }; 86.29 86.30 /************************************************/
87.1 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h Thu Jun 19 12:48:04 2008 +0900 87.2 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Jul 02 11:30:37 2008 +0900 87.3 @@ -333,10 +333,10 @@ enum vmcs_field { 87.4 #define VMCS_VPID_WIDTH 16 87.5 87.6 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr); 87.7 -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val); 87.8 -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val); 87.9 -int vmx_add_guest_msr(struct vcpu *v, u32 msr); 87.10 -int vmx_add_host_load_msr(struct vcpu *v, u32 msr); 87.11 +int vmx_read_guest_msr(u32 msr, u64 *val); 87.12 +int vmx_write_guest_msr(u32 msr, u64 val); 87.13 +int vmx_add_guest_msr(u32 msr); 87.14 +int vmx_add_host_load_msr(u32 msr); 87.15 87.16 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */ 87.17
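The vmcs.h prototypes drop their struct vcpu * parameter, so these MSR-area helpers now implicitly act on the current vcpu. An illustrative call-site update, not taken from this changeset (the MSR choice is arbitrary and a 0-on-success convention is assumed):

static int sample_guest_msr_read(void)
{
    u64 msr_content;

    /* old form: vmx_read_guest_msr(v, MSR_IA32_LASTBRANCHFROMIP, &msr_content) */
    return vmx_read_guest_msr(MSR_IA32_LASTBRANCHFROMIP, &msr_content);
}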
88.1 --- a/xen/include/asm-x86/mm.h Thu Jun 19 12:48:04 2008 +0900 88.2 +++ b/xen/include/asm-x86/mm.h Wed Jul 02 11:30:37 2008 +0900 88.3 @@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d 88.4 /* The order of the largest allocation unit we use for shadow pages */ 88.5 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ 88.6 88.7 +/* The number of out-of-sync shadows we allow per vcpu (prime, please) */ 88.8 +#define SHADOW_OOS_PAGES 3 88.9 + 88.10 +/* The order OOS fixup tables per vcpu */ 88.11 +#define SHADOW_OOS_FT_ORDER 1 88.12 +/* OOS fixup tables hash entries */ 88.13 +#define SHADOW_OOS_FT_HASH 13 88.14 + 88.15 #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) 88.16 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) 88.17
89.1 --- a/xen/include/asm-x86/perfc_defn.h Thu Jun 19 12:48:04 2008 +0900 89.2 +++ b/xen/include/asm-x86/perfc_defn.h Wed Jul 02 11:30:37 2008 +0900 89.3 @@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_2, "shad 89.4 PERFCOUNTER(shadow_writeable_h_3, "shadow writeable: 64b w2k3") 89.5 PERFCOUNTER(shadow_writeable_h_4, "shadow writeable: linux low/solaris") 89.6 PERFCOUNTER(shadow_writeable_h_5, "shadow writeable: linux high") 89.7 +PERFCOUNTER(shadow_writeable_h_6, "shadow writeable: unsync va") 89.8 +PERFCOUNTER(shadow_writeable_h_7, "shadow writeable: sl1p") 89.9 +PERFCOUNTER(shadow_writeable_h_8, "shadow writeable: sl1p failed") 89.10 PERFCOUNTER(shadow_writeable_bf, "shadow writeable brute-force") 89.11 +PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf") 89.12 PERFCOUNTER(shadow_mappings, "shadow removes all mappings") 89.13 PERFCOUNTER(shadow_mappings_bf, "shadow rm-mappings brute-force") 89.14 PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit") 89.15 @@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_pt, "shad 89.16 PERFCOUNTER(shadow_em_ex_non_pt, "shadow extra non-pt-write op") 89.17 PERFCOUNTER(shadow_em_ex_fail, "shadow extra emulation failed") 89.18 89.19 +PERFCOUNTER(shadow_oos_fixup_add_ok, "shadow OOS fixups adds") 89.20 +PERFCOUNTER(shadow_oos_fixup_no_add, "shadow OOS fixups no adds") 89.21 +PERFCOUNTER(shadow_oos_fixup_add_fail, "shadow OOS fixups adds failed") 89.22 +PERFCOUNTER(shadow_oos_fixup_remove, "shadow OOS fixups removes") 89.23 +PERFCOUNTER(shadow_oos_fixup_flush, "shadow OOS fixups flushes") 89.24 +PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes") 89.25 + 89.26 +PERFCOUNTER(shadow_unsync, "shadow OOS unsyncs") 89.27 +PERFCOUNTER(shadow_unsync_evict, "shadow OOS evictions") 89.28 +PERFCOUNTER(shadow_resync, "shadow OOS resyncs") 89.29 + 89.30 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
90.1 --- a/xen/include/public/hvm/hvm_op.h Thu Jun 19 12:48:04 2008 +0900 90.2 +++ b/xen/include/public/hvm/hvm_op.h Wed Jul 02 11:30:37 2008 +0900 90.3 @@ -92,6 +92,19 @@ struct xen_hvm_track_dirty_vram { 90.4 typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t; 90.5 DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t); 90.6 90.7 +/* Notify that some pages got modified by the Device Model. */ 90.8 +#define HVMOP_modified_memory 7 90.9 +struct xen_hvm_modified_memory { 90.10 + /* Domain to be updated. */ 90.11 + domid_t domid; 90.12 + /* First pfn. */ 90.13 + uint64_aligned_t first_pfn; 90.14 + /* Number of pages. */ 90.15 + uint64_aligned_t nr; 90.16 +}; 90.17 +typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t; 90.18 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t); 90.19 + 90.20 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ 90.21 90.22 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
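HVMOP_modified_memory gives a device model a way to tell the hypervisor which guest pages it has written behind the guest's back, presumably so that mechanisms such as log-dirty tracking see those updates. A hedged sketch of filling in the new argument structure; issue_hvm_op() is an invented stand-in for whatever privcmd or HYPERVISOR_hvm_op plumbing the caller actually uses, and all values are placeholders:

static int report_dirty_pages(void)
{
    struct xen_hvm_modified_memory arg = {
        .domid     = 1,       /* placeholder target domain */
        .first_pfn = 0x100,   /* first guest pfn the device model dirtied */
        .nr        = 16,      /* number of consecutive pages */
    };

    /* issue_hvm_op() stands in for the real hypercall plumbing. */
    return issue_hvm_op(HVMOP_modified_memory, &arg);
}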
91.1 --- a/xen/include/xen/domain.h Thu Jun 19 12:48:04 2008 +0900 91.2 +++ b/xen/include/xen/domain.h Wed Jul 02 11:30:37 2008 +0900 91.3 @@ -16,9 +16,6 @@ int boot_vcpu( 91.4 struct vcpu *alloc_idle_vcpu(unsigned int cpu_id); 91.5 void vcpu_reset(struct vcpu *v); 91.6 91.7 -struct domain *alloc_domain(domid_t domid); 91.8 -void free_domain(struct domain *d); 91.9 - 91.10 struct xen_domctl_getdomaininfo; 91.11 void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info); 91.12
92.1 --- a/xen/include/xen/sched.h Thu Jun 19 12:48:04 2008 +0900 92.2 +++ b/xen/include/xen/sched.h Wed Jul 02 11:30:37 2008 +0900 92.3 @@ -315,10 +315,14 @@ static inline struct domain *get_current 92.4 struct domain *domain_create( 92.5 domid_t domid, unsigned int domcr_flags, ssidref_t ssidref); 92.6 /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */ 92.7 -#define _DOMCRF_hvm 0 92.8 -#define DOMCRF_hvm (1U<<_DOMCRF_hvm) 92.9 -#define _DOMCRF_hap 1 92.10 -#define DOMCRF_hap (1U<<_DOMCRF_hap) 92.11 +#define _DOMCRF_hvm 0 92.12 +#define DOMCRF_hvm (1U<<_DOMCRF_hvm) 92.13 + /* DOMCRF_hap: Create a domain with hardware-assisted paging. */ 92.14 +#define _DOMCRF_hap 1 92.15 +#define DOMCRF_hap (1U<<_DOMCRF_hap) 92.16 + /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */ 92.17 +#define _DOMCRF_dummy 2 92.18 +#define DOMCRF_dummy (1U<<_DOMCRF_dummy) 92.19 92.20 int construct_dom0( 92.21 struct domain *d,