debuggers.hg
changeset 3767:f5f2757b3aa2
bitkeeper revision 1.1159.1.545 (4208ec60-ql2CB2KKyZRC_8udlW9kA)
Merge tempest.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xeno.bk
into tempest.cl.cam.ac.uk:/local/scratch/smh22/xen-unstable.bk
1.1 --- a/.rootkeys Mon Feb 07 08:19:24 2005 +0000 1.2 +++ b/.rootkeys Tue Feb 08 16:44:16 2005 +0000 1.3 @@ -867,8 +867,8 @@ 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/ 1.4 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c 1.5 3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c 1.6 3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c 1.7 -40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c 1.8 41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c 1.9 +40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c 1.10 3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c 1.11 41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c 1.12 41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c 1.13 @@ -1038,6 +1038,7 @@ 41c0c412lQ0NVVN9PsOSznQ-qhOiPA xen/inclu 1.14 418fbcfe_WliJPToeVM-9VStvym-hw xen/include/asm-x86/x86_32/asm_defns.h 1.15 3ddb79c2ADvRmdexd9y3AYK9_NTx-Q xen/include/asm-x86/x86_32/current.h 1.16 3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-x86/x86_32/domain_page.h 1.17 +4208e2a3ZNFroNXbX9OYaOB-xtUyDQ xen/include/asm-x86/x86_32/page.h 1.18 3ddb79c3mbqEM7QQr3zVq7NiBNhouA xen/include/asm-x86/x86_32/regs.h 1.19 3e7f358aG11EvMI9VJ4_9hD4LUO7rQ xen/include/asm-x86/x86_32/string.h 1.20 3ddb79c3M2n1ROZH6xk3HbyN4CPDqg xen/include/asm-x86/x86_32/uaccess.h 1.21 @@ -1045,6 +1046,7 @@ 41bf1717bML6GxpclTWJabiaO5W5vg xen/inclu 1.22 404f1b9ceJeGVaPNIENm2FkK0AgEOQ xen/include/asm-x86/x86_64/current.h 1.23 41febc4b1aCGLsm0Y0b_82h7lFtrEA xen/include/asm-x86/x86_64/domain_page.h 1.24 404f1badfXZJZ2sU8sh9PS2EZvd19Q xen/include/asm-x86/x86_64/ldt.h 1.25 +4208e2a3Fktw4ZttKdDxbhvTQ6brfQ xen/include/asm-x86/x86_64/page.h 1.26 404f1bb86rAXB3aLS1vYdcqpJiEcyg xen/include/asm-x86/x86_64/regs.h 1.27 40e1966azOJZfNI6Ilthe6Q-T3Hewg xen/include/asm-x86/x86_64/string.h 1.28 404f1bc4tWkB9Qr8RkKtZGW5eMQzhw xen/include/asm-x86/x86_64/uaccess.h
2.1 --- a/linux-2.4.29-xen-sparse/mm/memory.c Mon Feb 07 08:19:24 2005 +0000 2.2 +++ b/linux-2.4.29-xen-sparse/mm/memory.c Tue Feb 08 16:44:16 2005 +0000 2.3 @@ -915,7 +915,7 @@ static inline void establish_pte(struct 2.4 #ifdef CONFIG_XEN 2.5 if ( likely(vma->vm_mm == current->mm) ) { 2.6 XEN_flush_page_update_queue(); 2.7 - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG); 2.8 + HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG); 2.9 } else { 2.10 set_pte(page_table, entry); 2.11 flush_tlb_page(vma, address); 2.12 @@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct 2.13 #ifdef CONFIG_XEN 2.14 if ( likely(vma->vm_mm == current->mm) ) { 2.15 XEN_flush_page_update_queue(); 2.16 - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0); 2.17 + HYPERVISOR_update_va_mapping(address, pte, 0); 2.18 } else { 2.19 set_pte(page_table, pte); 2.20 XEN_flush_page_update_queue(); 2.21 @@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_s 2.22 #ifdef CONFIG_XEN 2.23 if ( likely(vma->vm_mm == current->mm) ) { 2.24 XEN_flush_page_update_queue(); 2.25 - HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0); 2.26 + HYPERVISOR_update_va_mapping(addr, entry, 0); 2.27 } else { 2.28 set_pte(page_table, entry); 2.29 XEN_flush_page_update_queue(); 2.30 @@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct * 2.31 #ifdef CONFIG_XEN 2.32 if ( likely(vma->vm_mm == current->mm) ) { 2.33 XEN_flush_page_update_queue(); 2.34 - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0); 2.35 + HYPERVISOR_update_va_mapping(address, entry, 0); 2.36 } else { 2.37 set_pte(page_table, entry); 2.38 XEN_flush_page_update_queue();
3.1 --- a/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c Mon Feb 07 08:19:24 2005 +0000 3.2 +++ b/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c Tue Feb 08 16:44:16 2005 +0000 3.3 @@ -229,7 +229,9 @@ fastcall void do_page_fault(struct pt_re 3.4 /* Set the "privileged fault" bit to something sane. */ 3.5 error_code &= 3; 3.6 error_code |= (regs->xcs & 2) << 1; 3.7 - 3.8 + if (regs->eflags & X86_EFLAGS_VM) 3.9 + error_code |= 4; 3.10 + 3.11 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 3.12 SIGSEGV) == NOTIFY_STOP) 3.13 return;
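The new check above makes vm86-mode faults report as user-mode faults when the error code's privilege bit is rebuilt from the saved context. An annotated reading of the resulting logic (the comments are editorial; the RPL test assumes the usual Xen guest layout of kernel in ring 1, user in ring 3):

    error_code &= 3;                    /* keep P (bit 0) and W/R (bit 1)       */
    error_code |= (regs->xcs & 2) << 1; /* CS selector RPL 2/3 => user (bit 2)  */
    if (regs->eflags & X86_EFLAGS_VM)   /* vm86 code runs with a real-mode CS,  */
        error_code |= 4;                /* so flag it as user-mode explicitly   */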
4.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c Mon Feb 07 08:19:24 2005 +0000 4.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c Tue Feb 08 16:44:16 2005 +0000 4.3 @@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int 4.4 for ( i = 0; i < nr_pages; i++ ) 4.5 { 4.6 mcl[i].op = __HYPERVISOR_update_va_mapping; 4.7 - mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; 4.8 + mcl[i].args[0] = MMAP_VADDR(idx, i); 4.9 mcl[i].args[1] = 0; 4.10 mcl[i].args[2] = 0; 4.11 } 4.12 @@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blki 4.13 4.14 #ifdef CONFIG_XEN_BLKDEV_TAP_BE 4.15 if ( HYPERVISOR_update_va_mapping_otherdomain( 4.16 - MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, 4.17 + MMAP_VADDR(pending_idx, 0), 4.18 (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, 4.19 0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) ) 4.20 4.21 goto out; 4.22 #else 4.23 if ( HYPERVISOR_update_va_mapping_otherdomain( 4.24 - MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, 4.25 + MMAP_VADDR(pending_idx, 0), 4.26 (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, 4.27 0, blkif->domid) ) 4.28 4.29 @@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t 4.30 for ( i = 0; i < nr_psegs; i++ ) 4.31 { 4.32 mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; 4.33 - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; 4.34 + mcl[i].args[0] = MMAP_VADDR(pending_idx, i); 4.35 mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; 4.36 mcl[i].args[2] = 0; 4.37 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
5.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c Mon Feb 07 08:19:24 2005 +0000 5.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c Tue Feb 08 16:44:16 2005 +0000 5.3 @@ -16,7 +16,7 @@ 5.4 5.5 #include "blktap.h" 5.6 5.7 -int __init xlblk_init(void) 5.8 +int __init xlblktap_init(void) 5.9 { 5.10 ctrl_msg_t cmsg; 5.11 blkif_fe_driver_status_t fe_st; 5.12 @@ -64,6 +64,7 @@ int __init xlblk_init(void) 5.13 return 0; 5.14 } 5.15 5.16 +#if 0 /* tap doesn't handle suspend/resume */ 5.17 void blkdev_suspend(void) 5.18 { 5.19 } 5.20 @@ -81,6 +82,6 @@ void blkdev_resume(void) 5.21 memcpy(cmsg.msg, &st, sizeof(st)); 5.22 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); 5.23 } 5.24 - 5.25 +#endif 5.26 5.27 -__initcall(xlblk_init); 5.28 +__initcall(xlblktap_init);
6.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h Mon Feb 07 08:19:24 2005 +0000 6.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h Tue Feb 08 16:44:16 2005 +0000 6.3 @@ -48,6 +48,12 @@ 6.4 #define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args) 6.5 6.6 6.7 +/* -------[ state descriptors ]--------------------------------------- */ 6.8 + 6.9 +#define BLKIF_STATE_CLOSED 0 6.10 +#define BLKIF_STATE_DISCONNECTED 1 6.11 +#define BLKIF_STATE_CONNECTED 2 6.12 + 6.13 /* -------[ connection tracking ]------------------------------------- */ 6.14 6.15 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) 6.16 @@ -99,7 +105,6 @@ typedef struct { 6.17 unsigned long mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 6.18 unsigned long virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 6.19 int next_free; 6.20 - int inuse; /* debugging */ 6.21 } active_req_t; 6.22 6.23 typedef unsigned int ACTIVE_RING_IDX; 6.24 @@ -181,7 +186,7 @@ extern unsigned long mmap_vstart; 6.25 * for shared memory rings. 6.26 */ 6.27 6.28 -#define RING_PAGES 128 6.29 +#define RING_PAGES 3 /* Ctrl, Front, and Back */ 6.30 extern unsigned long rings_vstart; 6.31 6.32 6.33 @@ -190,11 +195,10 @@ extern unsigned long blktap_mode; 6.34 6.35 /* Connection to a single backend domain. */ 6.36 extern blkif_front_ring_t blktap_be_ring; 6.37 +extern unsigned int blktap_be_evtchn; 6.38 +extern unsigned int blktap_be_state; 6.39 6.40 -/* Event channel to backend domain. */ 6.41 -extern unsigned int blkif_ptbe_evtchn; 6.42 - 6.43 -/* User ring status... this will soon vanish into a ring struct. */ 6.44 +/* User ring status. */ 6.45 extern unsigned long blktap_ring_ok; 6.46 6.47 /* -------[ ...and function prototypes. ]----------------------------- */ 6.48 @@ -213,8 +217,7 @@ void blktap_kick_user(void); 6.49 /* user ring access functions: */ 6.50 int blktap_write_fe_ring(blkif_request_t *req); 6.51 int blktap_write_be_ring(blkif_response_t *rsp); 6.52 -int blktap_read_fe_ring(void); 6.53 -int blktap_read_be_ring(void); 6.54 +int blktap_write_ctrl_ring(ctrl_msg_t *msg); 6.55 6.56 /* fe/be ring access functions: */ 6.57 int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp);
7.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Mon Feb 07 08:19:24 2005 +0000 7.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Tue Feb 08 16:44:16 2005 +0000 7.3 @@ -10,10 +10,6 @@ 7.4 7.5 #include "blktap.h" 7.6 7.7 -#define BLKIF_STATE_CLOSED 0 7.8 -#define BLKIF_STATE_DISCONNECTED 1 7.9 -#define BLKIF_STATE_CONNECTED 2 7.10 - 7.11 static char *blkif_state_name[] = { 7.12 [BLKIF_STATE_CLOSED] = "closed", 7.13 [BLKIF_STATE_DISCONNECTED] = "disconnected", 7.14 @@ -26,9 +22,10 @@ static char * blkif_status_name[] = { 7.15 [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", 7.16 [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", 7.17 }; 7.18 -static unsigned int blkif_pt_state = BLKIF_STATE_CLOSED; 7.19 -static unsigned blkif_ptbe_irq; 7.20 -unsigned int blkif_ptbe_evtchn; 7.21 + 7.22 +static unsigned blktap_be_irq; 7.23 +unsigned int blktap_be_state = BLKIF_STATE_CLOSED; 7.24 +unsigned int blktap_be_evtchn; 7.25 7.26 /*-----[ Control Messages to/from Frontend VMs ]--------------------------*/ 7.27 7.28 @@ -306,7 +303,7 @@ static void blkif_ptbe_disconnect(void) 7.29 sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL); 7.30 SHARED_RING_INIT(BLKIF_RING, sring); 7.31 FRONT_RING_INIT(BLKIF_RING, &blktap_be_ring, sring); 7.32 - blkif_pt_state = BLKIF_STATE_DISCONNECTED; 7.33 + blktap_be_state = BLKIF_STATE_DISCONNECTED; 7.34 DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n"); 7.35 blkif_ptbe_send_interface_connect(); 7.36 } 7.37 @@ -315,10 +312,10 @@ static void blkif_ptbe_connect(blkif_fe_ 7.38 { 7.39 int err = 0; 7.40 7.41 - blkif_ptbe_evtchn = status->evtchn; 7.42 - blkif_ptbe_irq = bind_evtchn_to_irq(blkif_ptbe_evtchn); 7.43 + blktap_be_evtchn = status->evtchn; 7.44 + blktap_be_irq = bind_evtchn_to_irq(blktap_be_evtchn); 7.45 7.46 - err = request_irq(blkif_ptbe_irq, blkif_ptbe_int, 7.47 + err = request_irq(blktap_be_irq, blkif_ptbe_int, 7.48 SA_SAMPLE_RANDOM, "blkif", NULL); 7.49 if ( err ) { 7.50 WPRINTK("blkfront request_irq failed (%d)\n", err); 7.51 @@ -326,7 +323,7 @@ static void blkif_ptbe_connect(blkif_fe_ 7.52 } else { 7.53 /* transtion to connected in case we need to do a 7.54 a partion probe on a whole disk */ 7.55 - blkif_pt_state = BLKIF_STATE_CONNECTED; 7.56 + blktap_be_state = BLKIF_STATE_CONNECTED; 7.57 } 7.58 } 7.59 7.60 @@ -334,7 +331,7 @@ static void unexpected(blkif_fe_interfac 7.61 { 7.62 WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", 7.63 blkif_status_name[status->status], 7.64 - blkif_state_name[blkif_pt_state]); 7.65 + blkif_state_name[blktap_be_state]); 7.66 } 7.67 7.68 static void blkif_ptbe_status( 7.69 @@ -352,7 +349,7 @@ static void blkif_ptbe_status( 7.70 switch ( status->status ) 7.71 { 7.72 case BLKIF_INTERFACE_STATUS_CLOSED: 7.73 - switch ( blkif_pt_state ) 7.74 + switch ( blktap_be_state ) 7.75 { 7.76 case BLKIF_STATE_CLOSED: 7.77 unexpected(status); 7.78 @@ -366,7 +363,7 @@ static void blkif_ptbe_status( 7.79 break; 7.80 7.81 case BLKIF_INTERFACE_STATUS_DISCONNECTED: 7.82 - switch ( blkif_pt_state ) 7.83 + switch ( blktap_be_state ) 7.84 { 7.85 case BLKIF_STATE_CLOSED: 7.86 blkif_ptbe_disconnect(); 7.87 @@ -380,7 +377,7 @@ static void blkif_ptbe_status( 7.88 break; 7.89 7.90 case BLKIF_INTERFACE_STATUS_CONNECTED: 7.91 - switch ( blkif_pt_state ) 7.92 + switch ( blktap_be_state ) 7.93 { 7.94 case BLKIF_STATE_CLOSED: 7.95 unexpected(status); 7.96 @@ -398,7 +395,7 @@ static void blkif_ptbe_status( 7.97 break; 7.98 7.99 case BLKIF_INTERFACE_STATUS_CHANGED: 7.100 - switch ( blkif_pt_state ) 7.101 + switch 
( blktap_be_state ) 7.102 { 7.103 case BLKIF_STATE_CLOSED: 7.104 case BLKIF_STATE_DISCONNECTED: 7.105 @@ -440,6 +437,14 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, un 7.106 7.107 case CMSG_BLKIF_BE: 7.108 7.109 + /* send a copy of the message to user if wanted */ 7.110 + 7.111 + if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) || 7.112 + (blktap_mode & BLKTAP_MODE_COPY_FE) ) { 7.113 + 7.114 + blktap_write_ctrl_ring(msg); 7.115 + } 7.116 + 7.117 switch ( msg->subtype ) 7.118 { 7.119 case CMSG_BLKIF_BE_CREATE: 7.120 @@ -500,11 +505,13 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, un 7.121 ctrl_if_send_response(msg); 7.122 } 7.123 7.124 -/*-----[ All control messages enter here: ]-------------------------------*/ 7.125 +/*-----[ Initialization ]-------------------------------------------------*/ 7.126 7.127 void __init blkif_interface_init(void) 7.128 { 7.129 blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 7.130 0, 0, NULL, NULL); 7.131 memset(blkif_hash, 0, sizeof(blkif_hash)); 7.132 + 7.133 + blktap_be_ring.sring = NULL; 7.134 }
8.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c Mon Feb 07 08:19:24 2005 +0000 8.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c Tue Feb 08 16:44:16 2005 +0000 8.3 @@ -40,8 +40,6 @@ inline active_req_t *get_active_req(void 8.4 spin_lock_irqsave(&active_req_lock, flags); 8.5 idx = active_req_ring[MASK_ACTIVE_IDX(active_cons++)]; 8.6 ar = &active_reqs[idx]; 8.7 -if (ar->inuse) WPRINTK("AR INUSE! (%lu)\n", ar->id); 8.8 -ar->inuse = 1; 8.9 spin_unlock_irqrestore(&active_req_lock, flags); 8.10 8.11 return ar; 8.12 @@ -52,7 +50,6 @@ inline void free_active_req(active_req_t 8.13 unsigned long flags; 8.14 8.15 spin_lock_irqsave(&active_req_lock, flags); 8.16 -ar->inuse = 0; 8.17 active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar); 8.18 spin_unlock_irqrestore(&active_req_lock, flags); 8.19 } 8.20 @@ -97,11 +94,8 @@ inline int write_resp_to_fe_ring(blkif_t 8.21 blkif_response_t *resp_d; 8.22 active_req_t *ar; 8.23 8.24 - /* remap id, and free the active req. blkif lookup goes here too.*/ 8.25 ar = &active_reqs[ID_TO_IDX(rsp->id)]; 8.26 - /* WPRINTK("%3u > %3lu\n", ID_TO_IDX(rsp->id), ar->id); */ 8.27 rsp->id = ar->id; 8.28 - free_active_req(ar); 8.29 8.30 resp_d = RING_GET_RESPONSE(BLKIF_RING, &blkif->blk_ring, 8.31 blkif->blk_ring.rsp_prod_pvt); 8.32 @@ -109,6 +103,9 @@ inline int write_resp_to_fe_ring(blkif_t 8.33 wmb(); 8.34 blkif->blk_ring.rsp_prod_pvt++; 8.35 8.36 + blkif_put(ar->blkif); 8.37 + free_active_req(ar); 8.38 + 8.39 return 0; 8.40 } 8.41 8.42 @@ -116,6 +113,11 @@ inline int write_req_to_be_ring(blkif_re 8.43 { 8.44 blkif_request_t *req_d; 8.45 8.46 + if ( blktap_be_state != BLKIF_STATE_CONNECTED ) { 8.47 + WPRINTK("Tap trying to access an unconnected backend!\n"); 8.48 + return 0; 8.49 + } 8.50 + 8.51 req_d = RING_GET_REQUEST(BLKIF_RING, &blktap_be_ring, 8.52 blktap_be_ring.req_prod_pvt); 8.53 memcpy(req_d, req, sizeof(blkif_request_t)); 8.54 @@ -135,9 +137,12 @@ inline void kick_fe_domain(blkif_t *blki 8.55 8.56 inline void kick_be_domain(void) 8.57 { 8.58 + if ( blktap_be_state != BLKIF_STATE_CONNECTED ) 8.59 + return; 8.60 + 8.61 wmb(); /* Ensure that the frontend can see the requests. */ 8.62 RING_PUSH_REQUESTS(BLKIF_RING, &blktap_be_ring); 8.63 - notify_via_evtchn(blkif_ptbe_evtchn); 8.64 + notify_via_evtchn(blktap_be_evtchn); 8.65 DPRINTK("notified BE\n"); 8.66 } 8.67 8.68 @@ -310,6 +315,7 @@ static int do_block_io_op(blkif_t *blkif 8.69 */ 8.70 ar = get_active_req(); 8.71 ar->id = req_s->id; 8.72 + blkif_get(blkif); 8.73 ar->blkif = blkif; 8.74 req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar)); 8.75 /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */ 8.76 @@ -458,11 +464,13 @@ void print_vm_ring_idxs(void) 8.77 blkif->blk_ring.sring->req_prod, 8.78 blkif->blk_ring.sring->rsp_prod); 8.79 } 8.80 - WPRINTK("BE Ring: \n--------\n"); 8.81 - WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d " 8.82 - "| req_prod: %2d, rsp_prod: %2d\n", 8.83 - blktap_be_ring.rsp_cons, 8.84 - blktap_be_ring.req_prod_pvt, 8.85 - blktap_be_ring.sring->req_prod, 8.86 - blktap_be_ring.sring->rsp_prod); 8.87 + if (blktap_be_ring.sring != NULL) { 8.88 + WPRINTK("BE Ring: \n--------\n"); 8.89 + WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d " 8.90 + "| req_prod: %2d, rsp_prod: %2d\n", 8.91 + blktap_be_ring.rsp_cons, 8.92 + blktap_be_ring.req_prod_pvt, 8.93 + blktap_be_ring.sring->req_prod, 8.94 + blktap_be_ring.sring->rsp_prod); 8.95 + } 8.96 }
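The datapath changes above extend the lifetime of an active request and its blkif across the whole round trip: a reference is taken when the request is tagged on its way to the backend, and only dropped after the response has been copied back onto the frontend ring. A sketch of the pairing (paraphrased from the hunks, not additional code):

    /* request path (do_block_io_op):                                      */
    /*     ar = get_active_req();  ar->id = req_s->id;                     */
    /*     blkif_get(blkif);  ar->blkif = blkif;        <- take reference  */
    /* response path (write_resp_to_fe_ring):                              */
    /*     rsp->id = ar->id;  ...copy response, advance rsp_prod_pvt...;   */
    /*     blkif_put(ar->blkif);  free_active_req(ar);  <- drop reference  */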
9.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c Mon Feb 07 08:19:24 2005 +0000 9.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c Tue Feb 08 16:44:16 2005 +0000 9.3 @@ -19,6 +19,7 @@ 9.4 #include <linux/gfp.h> 9.5 #include <linux/poll.h> 9.6 #include <asm/pgalloc.h> 9.7 +#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */ 9.8 9.9 #include "blktap.h" 9.10 9.11 @@ -40,6 +41,11 @@ unsigned long rings_vstart; 9.12 /* Rings up to user space. */ 9.13 static blkif_front_ring_t blktap_ufe_ring; 9.14 static blkif_back_ring_t blktap_ube_ring; 9.15 +static ctrl_front_ring_t blktap_uctrl_ring; 9.16 + 9.17 +/* local prototypes */ 9.18 +static int blktap_read_fe_ring(void); 9.19 +static int blktap_read_be_ring(void); 9.20 9.21 /* -------[ blktap vm ops ]------------------------------------------- */ 9.22 9.23 @@ -66,16 +72,28 @@ struct vm_operations_struct blktap_vm_op 9.24 static int blktap_open(struct inode *inode, struct file *filp) 9.25 { 9.26 blkif_sring_t *sring; 9.27 + ctrl_sring_t *csring; 9.28 9.29 if ( test_and_set_bit(0, &blktap_dev_inuse) ) 9.30 return -EBUSY; 9.31 9.32 printk(KERN_ALERT "blktap open.\n"); 9.33 + 9.34 + /* Allocate the ctrl ring. */ 9.35 + csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL); 9.36 + if (csring == NULL) 9.37 + goto fail_nomem; 9.38 + 9.39 + SetPageReserved(virt_to_page(csring)); 9.40 + 9.41 + SHARED_RING_INIT(CTRL_RING, csring); 9.42 + FRONT_RING_INIT(CTRL_RING, &blktap_uctrl_ring, csring); 9.43 + 9.44 9.45 /* Allocate the fe ring. */ 9.46 sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); 9.47 if (sring == NULL) 9.48 - goto fail_nomem; 9.49 + goto fail_free_ctrl; 9.50 9.51 SetPageReserved(virt_to_page(sring)); 9.52 9.53 @@ -95,6 +113,9 @@ static int blktap_open(struct inode *ino 9.54 DPRINTK(KERN_ALERT "blktap open.\n"); 9.55 9.56 return 0; 9.57 + 9.58 + fail_free_ctrl: 9.59 + free_page( (unsigned long) blktap_uctrl_ring.sring); 9.60 9.61 fail_free_fe: 9.62 free_page( (unsigned long) blktap_ufe_ring.sring); 9.63 @@ -111,6 +132,9 @@ static int blktap_release(struct inode * 9.64 printk(KERN_ALERT "blktap closed.\n"); 9.65 9.66 /* Free the ring page. */ 9.67 + ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring)); 9.68 + free_page((unsigned long) blktap_uctrl_ring.sring); 9.69 + 9.70 ClearPageReserved(virt_to_page(blktap_ufe_ring.sring)); 9.71 free_page((unsigned long) blktap_ufe_ring.sring); 9.72 9.73 @@ -120,6 +144,15 @@ static int blktap_release(struct inode * 9.74 return 0; 9.75 } 9.76 9.77 +/* Note on mmap: 9.78 + * remap_pfn_range sets VM_IO on vma->vm_flags. In trying to make libaio 9.79 + * work to do direct page access from userspace, this ended up being a 9.80 + * problem. The bigger issue seems to be that there is no way to map 9.81 + * a foreign page in to user space and have the virtual address of that 9.82 + * page map sanely down to a mfn. 9.83 + * Removing the VM_IO flag results in a loop in get_user_pages, as 9.84 + * pfn_valid() always fails on a foreign page. 9.85 + */ 9.86 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) 9.87 { 9.88 int size; 9.89 @@ -148,20 +181,28 @@ static int blktap_mmap(struct file *filp 9.90 /* not sure if I really need to do this... 
*/ 9.91 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 9.92 9.93 + DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring)); 9.94 + if (remap_pfn_range(vma, vma->vm_start, 9.95 + __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, 9.96 + PAGE_SIZE, vma->vm_page_prot)) { 9.97 + WPRINTK("ctrl_ring: remap_pfn_range failure!\n"); 9.98 + } 9.99 + 9.100 + 9.101 DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring)); 9.102 - if (remap_page_range(vma, vma->vm_start, 9.103 - __pa(blktap_ube_ring.sring), 9.104 + if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, 9.105 + __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, 9.106 PAGE_SIZE, vma->vm_page_prot)) { 9.107 - WPRINTK("be_ring: remap_page_range failure!\n"); 9.108 + WPRINTK("be_ring: remap_pfn_range failure!\n"); 9.109 } 9.110 9.111 DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring)); 9.112 - if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, 9.113 - __pa(blktap_ufe_ring.sring), 9.114 + if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), 9.115 + __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 9.116 PAGE_SIZE, vma->vm_page_prot)) { 9.117 - WPRINTK("fe_ring: remap_page_range failure!\n"); 9.118 + WPRINTK("fe_ring: remap_pfn_range failure!\n"); 9.119 } 9.120 - 9.121 + 9.122 blktap_vma = vma; 9.123 blktap_ring_ok = 1; 9.124 9.125 @@ -211,9 +252,11 @@ static unsigned int blktap_poll(struct f 9.126 { 9.127 poll_wait(file, &blktap_wait, wait); 9.128 9.129 - if ( RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_ufe_ring) || 9.130 + if ( RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_uctrl_ring) || 9.131 + RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_ufe_ring) || 9.132 RING_HAS_UNPUSHED_RESPONSES(BLKIF_RING, &blktap_ube_ring) ) { 9.133 9.134 + RING_PUSH_REQUESTS(BLKIF_RING, &blktap_uctrl_ring); 9.135 RING_PUSH_REQUESTS(BLKIF_RING, &blktap_ufe_ring); 9.136 RING_PUSH_RESPONSES(BLKIF_RING, &blktap_ube_ring); 9.137 return POLLIN | POLLRDNORM; 9.138 @@ -260,7 +303,6 @@ int blktap_write_fe_ring(blkif_request_t 9.139 return 0; 9.140 } 9.141 9.142 - //target = RING_NEXT_EMPTY_REQUEST(BLKIF_RING, &blktap_ufe_ring); 9.143 target = RING_GET_REQUEST(BLKIF_RING, &blktap_ufe_ring, 9.144 blktap_ufe_ring.req_prod_pvt); 9.145 memcpy(target, req, sizeof(*req)); 9.146 @@ -270,7 +312,7 @@ int blktap_write_fe_ring(blkif_request_t 9.147 9.148 error = direct_remap_area_pages(blktap_vma->vm_mm, 9.149 MMAP_VADDR(ID_TO_IDX(req->id), i), 9.150 - target->frame_and_sects[0] & PAGE_MASK, 9.151 + target->frame_and_sects[i] & PAGE_MASK, 9.152 PAGE_SIZE, 9.153 blktap_vma->vm_page_prot, 9.154 ID_TO_DOM(req->id)); 9.155 @@ -302,7 +344,6 @@ int blktap_write_be_ring(blkif_response_ 9.156 9.157 /* No test for fullness in the response direction. */ 9.158 9.159 - //target = RING_NEXT_EMPTY_RESPONSE(BLKIF_RING, &blktap_ube_ring); 9.160 target = RING_GET_RESPONSE(BLKIF_RING, &blktap_ube_ring, 9.161 blktap_ube_ring.rsp_prod_pvt); 9.162 memcpy(target, rsp, sizeof(*rsp)); 9.163 @@ -314,7 +355,7 @@ int blktap_write_be_ring(blkif_response_ 9.164 return 0; 9.165 } 9.166 9.167 -int blktap_read_fe_ring(void) 9.168 +static int blktap_read_fe_ring(void) 9.169 { 9.170 /* This is called to read responses from the UFE ring. 
*/ 9.171 9.172 @@ -329,7 +370,6 @@ int blktap_read_fe_ring(void) 9.173 if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) { 9.174 9.175 /* for each outstanding message on the UFEring */ 9.176 - //RING_FOREACH_RESPONSE(BLKIF_RING, &blktap_ufe_ring, prod, resp_s) { 9.177 rp = blktap_ufe_ring.sring->rsp_prod; 9.178 rmb(); 9.179 9.180 @@ -349,7 +389,7 @@ int blktap_read_fe_ring(void) 9.181 return 0; 9.182 } 9.183 9.184 -int blktap_read_be_ring(void) 9.185 +static int blktap_read_be_ring(void) 9.186 { 9.187 /* This is called to read requests from the UBE ring. */ 9.188 9.189 @@ -362,7 +402,6 @@ int blktap_read_be_ring(void) 9.190 if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) { 9.191 9.192 /* for each outstanding message on the UFEring */ 9.193 - //RING_FOREACH_REQUEST(BLKIF_RING, &blktap_ube_ring, prod, req_s) { 9.194 rp = blktap_ube_ring.sring->req_prod; 9.195 rmb(); 9.196 for ( i = blktap_ube_ring.req_cons; i != rp; i++ ) 9.197 @@ -379,6 +418,31 @@ int blktap_read_be_ring(void) 9.198 9.199 return 0; 9.200 } 9.201 + 9.202 +int blktap_write_ctrl_ring(ctrl_msg_t *msg) 9.203 +{ 9.204 + ctrl_msg_t *target; 9.205 + 9.206 + if ( ! blktap_ring_ok ) { 9.207 + DPRINTK("blktap: be_ring not ready for a request!\n"); 9.208 + return 0; 9.209 + } 9.210 + 9.211 + /* No test for fullness in the response direction. */ 9.212 + 9.213 + target = RING_GET_REQUEST(CTRL_RING, &blktap_uctrl_ring, 9.214 + blktap_uctrl_ring.req_prod_pvt); 9.215 + memcpy(target, msg, sizeof(*msg)); 9.216 + 9.217 + blktap_uctrl_ring.req_prod_pvt++; 9.218 + 9.219 + /* currently treat the ring as unidirectional. */ 9.220 + blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod; 9.221 + 9.222 + return 0; 9.223 + 9.224 +} 9.225 + 9.226 /* -------[ blktap module setup ]------------------------------------- */ 9.227 9.228 static struct miscdevice blktap_miscdev = {
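With the control ring added, the blktap character device now exports three shared pages in the order established by the remap_pfn_range calls above: control ring, backend ring, frontend ring (matching RING_PAGES 3 in blktap.h). A minimal user-space sketch of that layout; the blktap_fd name is hypothetical and the ring types are taken from the Xen public headers as an assumption:

    #include <sys/mman.h>
    #include <unistd.h>

    void *rings = mmap(NULL, 3 * getpagesize(), PROT_READ | PROT_WRITE,
                       MAP_SHARED, blktap_fd, 0);
    ctrl_sring_t  *ctrl = (ctrl_sring_t *) rings;                               /* page 0 */
    blkif_sring_t *be   = (blkif_sring_t *)((char *)rings +     getpagesize()); /* page 1 */
    blkif_sring_t *fe   = (blkif_sring_t *)((char *)rings + 2 * getpagesize()); /* page 2 */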
10.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c Mon Feb 07 08:19:24 2005 +0000 10.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c Tue Feb 08 16:44:16 2005 +0000 10.3 @@ -234,7 +234,7 @@ static void net_rx_action(unsigned long 10.4 mmu[2].val = MMUEXT_REASSIGN_PAGE; 10.5 10.6 mcl[0].op = __HYPERVISOR_update_va_mapping; 10.7 - mcl[0].args[0] = vdata >> PAGE_SHIFT; 10.8 + mcl[0].args[0] = vdata; 10.9 mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; 10.10 mcl[0].args[2] = 0; 10.11 mcl[1].op = __HYPERVISOR_mmu_update; 10.12 @@ -409,7 +409,7 @@ static void net_tx_action(unsigned long 10.13 { 10.14 pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; 10.15 mcl[0].op = __HYPERVISOR_update_va_mapping; 10.16 - mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; 10.17 + mcl[0].args[0] = MMAP_VADDR(pending_idx); 10.18 mcl[0].args[1] = 0; 10.19 mcl[0].args[2] = 0; 10.20 mcl++; 10.21 @@ -546,7 +546,7 @@ static void net_tx_action(unsigned long 10.22 skb_reserve(skb, 16); 10.23 10.24 mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; 10.25 - mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; 10.26 + mcl[0].args[0] = MMAP_VADDR(pending_idx); 10.27 mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; 10.28 mcl[0].args[2] = 0; 10.29 mcl[0].args[3] = netif->domid;
11.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c Mon Feb 07 08:19:24 2005 +0000 11.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c Tue Feb 08 16:44:16 2005 +0000 11.3 @@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(str 11.4 = INVALID_P2M_ENTRY; 11.5 11.6 rx_mcl[i].op = __HYPERVISOR_update_va_mapping; 11.7 - rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT; 11.8 + rx_mcl[i].args[0] = (unsigned long)skb->head; 11.9 rx_mcl[i].args[1] = 0; 11.10 rx_mcl[i].args[2] = 0; 11.11 } 11.12 @@ -593,7 +593,7 @@ static int netif_poll(struct net_device 11.13 mmu->val = __pa(skb->head) >> PAGE_SHIFT; 11.14 mmu++; 11.15 mcl->op = __HYPERVISOR_update_va_mapping; 11.16 - mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT; 11.17 + mcl->args[0] = (unsigned long)skb->head; 11.18 mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; 11.19 mcl->args[2] = 0; 11.20 mcl++;
12.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c Mon Feb 07 08:19:24 2005 +0000 12.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c Tue Feb 08 16:44:16 2005 +0000 12.3 @@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int 12.4 for ( i = 0; i < nr_pages; i++ ) 12.5 { 12.6 mcl[i].op = __HYPERVISOR_update_va_mapping; 12.7 - mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; 12.8 + mcl[i].args[0] = MMAP_VADDR(idx, i); 12.9 mcl[i].args[1] = 0; 12.10 mcl[i].args[2] = 0; 12.11 } 12.12 @@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t 12.13 i++, offset += PAGE_SIZE ) 12.14 { 12.15 mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; 12.16 - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; 12.17 + mcl[i].args[0] = MMAP_VADDR(pending_idx, i); 12.18 mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot; 12.19 mcl[i].args[2] = 0; 12.20 mcl[i].args[3] = up->domid; 12.21 @@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t 12.22 { 12.23 /* Map in ISO schedule, if necessary. */ 12.24 mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; 12.25 - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; 12.26 + mcl[i].args[0] = MMAP_VADDR(pending_idx, i); 12.27 mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot; 12.28 mcl[i].args[2] = 0; 12.29 mcl[i].args[3] = up->domid;
13.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h Mon Feb 07 08:19:24 2005 +0000 13.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h Tue Feb 08 16:44:16 2005 +0000 13.3 @@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned lo 13.4 if (__dirty) { \ 13.5 if ( likely((__vma)->vm_mm == current->mm) ) { \ 13.6 xen_flush_page_update_queue(); \ 13.7 - HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \ 13.8 + HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \ 13.9 } else { \ 13.10 xen_l1_entry_update((__ptep), (__entry).pte_low); \ 13.11 flush_tlb_page((__vma), (__address)); \ 13.12 @@ -445,7 +445,7 @@ do { \ 13.13 do { \ 13.14 if (likely((__vma)->vm_mm == current->mm)) { \ 13.15 xen_flush_page_update_queue(); \ 13.16 - HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, \ 13.17 + HYPERVISOR_update_va_mapping((__address), \ 13.18 __entry, 0); \ 13.19 } else { \ 13.20 xen_l1_entry_update((__ptep), (__entry).pte_low); \
14.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h Mon Feb 07 08:19:24 2005 +0000 14.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h Tue Feb 08 16:44:16 2005 +0000 14.3 @@ -438,7 +438,7 @@ HYPERVISOR_multicall( 14.4 14.5 static inline int 14.6 HYPERVISOR_update_va_mapping( 14.7 - unsigned long page_nr, pte_t new_val, unsigned long flags) 14.8 + unsigned long va, pte_t new_val, unsigned long flags) 14.9 { 14.10 int ret; 14.11 unsigned long ign1, ign2, ign3; 14.12 @@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping( 14.13 TRAP_INSTR 14.14 : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) 14.15 : "0" (__HYPERVISOR_update_va_mapping), 14.16 - "1" (page_nr), "2" ((new_val).pte_low), "3" (flags) 14.17 + "1" (va), "2" ((new_val).pte_low), "3" (flags) 14.18 : "memory" ); 14.19 14.20 if ( unlikely(ret < 0) ) 14.21 { 14.22 printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n", 14.23 - page_nr, (new_val).pte_low, flags); 14.24 + va, (new_val).pte_low, flags); 14.25 BUG(); 14.26 } 14.27 14.28 @@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op( 14.29 14.30 static inline int 14.31 HYPERVISOR_update_va_mapping_otherdomain( 14.32 - unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid) 14.33 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) 14.34 { 14.35 int ret; 14.36 unsigned long ign1, ign2, ign3, ign4; 14.37 @@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain 14.38 TRAP_INSTR 14.39 : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) 14.40 : "0" (__HYPERVISOR_update_va_mapping_otherdomain), 14.41 - "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) : 14.42 + "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) : 14.43 "memory" ); 14.44 14.45 return ret;
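This is the interface change that all of the >> PAGE_SHIFT removals in the other files follow from: the first argument of HYPERVISOR_update_va_mapping (and of the _otherdomain variant) is now the virtual address itself rather than a virtual page number. A minimal caller sketch of the new convention; update_one_pte is a hypothetical helper name:

    /* old: HYPERVISOR_update_va_mapping(addr >> PAGE_SHIFT, pte, UVMF_INVLPG); */
    static inline int update_one_pte(unsigned long addr, pte_t pte)
    {
        /* new: pass the virtual address directly */
        return HYPERVISOR_update_va_mapping(addr, pte, UVMF_INVLPG);
    }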
15.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h Mon Feb 07 08:19:24 2005 +0000 15.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h Tue Feb 08 16:44:16 2005 +0000 15.3 @@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, in 15.4 } 15.5 15.6 static inline int 15.7 -HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val, 15.8 +HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val, 15.9 unsigned long flags) 15.10 { 15.11 int ret; 15.12 @@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned lo 15.13 TRAP_INSTR 15.14 : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) 15.15 : "0" (__HYPERVISOR_update_va_mapping), 15.16 - "1" (page_nr), "2" (new_val), "3" (flags) 15.17 + "1" (va), "2" (new_val), "3" (flags) 15.18 : "memory" ); 15.19 15.20 if (__predict_false(ret < 0)) 15.21 panic("Failed update VA mapping: %08lx, %08lx, %08lx", 15.22 - page_nr, new_val, flags); 15.23 + va, new_val, flags); 15.24 15.25 return ret; 15.26 } 15.27 @@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int c 15.28 } 15.29 15.30 static inline int 15.31 -HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr, 15.32 +HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, 15.33 unsigned long new_val, unsigned long flags, domid_t domid) 15.34 { 15.35 int ret; 15.36 @@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain 15.37 TRAP_INSTR 15.38 : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) 15.39 : "0" (__HYPERVISOR_update_va_mapping_otherdomain), 15.40 - "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) : 15.41 + "1" (va), "2" (new_val), "3" (flags), "4" (domid) : 15.42 "memory" ); 15.43 15.44 return ret;
16.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c Mon Feb 07 08:19:24 2005 +0000 16.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c Tue Feb 08 16:44:16 2005 +0000 16.3 @@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_soft 16.4 INVALID_P2M_ENTRY; 16.5 16.6 rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; 16.7 - rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT; 16.8 + rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va; 16.9 rx_mcl[nr_pfns].args[1] = 0; 16.10 rx_mcl[nr_pfns].args[2] = 0; 16.11 16.12 @@ -679,7 +679,7 @@ xen_network_handler(void *arg) 16.13 mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT; 16.14 mmu++; 16.15 mcl->op = __HYPERVISOR_update_va_mapping; 16.16 - mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT; 16.17 + mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va; 16.18 mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW; 16.19 mcl->args[2] = UVMF_FLUSH_TLB; // 0; 16.20 mcl++; 16.21 @@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_s 16.22 INVALID_P2M_ENTRY; 16.23 16.24 rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; 16.25 - rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT; 16.26 + rx_mcl[nr_pfns].args[0] = va; 16.27 rx_mcl[nr_pfns].args[1] = 0; 16.28 rx_mcl[nr_pfns].args[2] = 0; 16.29
17.1 --- a/tools/examples/bochsrc Mon Feb 07 08:19:24 2005 +0000 17.2 +++ b/tools/examples/bochsrc Tue Feb 08 16:44:16 2005 +0000 17.3 @@ -3,10 +3,12 @@ 17.4 #vgaromimage: $BXSHARE/VGABIOS-lgpl-latest 17.5 floppya: 1_44=a.img, status=inserted 17.6 floppyb: 1_44=b.img, status=inserted 17.7 -#ata0-master: type=disk, path=minibootable.img, cylinders=900, heads=15, spt=17 17.8 # if you don't use absolute paths below, bochs looks under the cwd of xend, 17.9 # which is usually "/" 17.10 -ata0-master: type=disk, path=/tmp/min-fc2-i386.img, cylinders=800, heads=4, spt=32 17.11 +#ata0-master: type=disk, path=/var/images/min-el3-i386.img, cylinders=800, heads=4, spt=32 17.12 +i440fxsupport: enabled=1 17.13 +ne2k: ioaddr=0x300, irq=9, mac=b0:c4:22:01:00:00, ethmod=linux, ethdev=eth0 17.14 +ata0-master: type=disk, path=/var/images/1g-el3-i386.img, mode=flat, cylinders=2048, heads=16, spt=63 17.15 boot: c 17.16 17.17 log: /tmp/bochsout.txt 17.18 @@ -16,4 +18,3 @@ error: action=report 17.19 panic: action=ask 17.20 17.21 mouse: enabled=0 17.22 -ips: 1500000
18.1 --- a/tools/examples/vif-nat Mon Feb 07 08:19:24 2005 +0000 18.2 +++ b/tools/examples/vif-nat Tue Feb 08 16:44:16 2005 +0000 18.3 @@ -37,8 +37,8 @@ domain=${domain:?} 18.4 vif=${vif:?} 18.5 ip=${ip:?} 18.6 18.7 -# better way to strip /netmask from the ip? 18.8 -vif_ip=`echo ${ip} | awk -F. '{print $1"."$2"."$3"."$4}'` 18.9 +# strip /netmask 18.10 +vif_ip=`echo ${ip} | awk -F/ '{print $1}'` 18.11 18.12 main_ip=`ifconfig eth0 | grep "inet addr:" | sed -e 's/.*inet addr:\(\w\w*\.\w\w*\.\w\w*\.\w\w*\).*/\1/'` 18.13
19.1 --- a/tools/ioemu/include/config.h Mon Feb 07 08:19:24 2005 +0000 19.2 +++ b/tools/ioemu/include/config.h Tue Feb 08 16:44:16 2005 +0000 19.3 @@ -687,13 +687,13 @@ typedef 19.4 #define BX_NUM_SIMULATORS 1 19.5 19.6 // limited i440FX PCI support 19.7 -#define BX_PCI_SUPPORT 0 19.8 +#define BX_PCI_SUPPORT 1 19.9 19.10 // Experimental VGA on PCI 19.11 #define BX_PCI_VGA_SUPPORT 1 19.12 19.13 // limited USB on PCI 19.14 -#define BX_PCI_USB_SUPPORT 0 19.15 +#define BX_PCI_USB_SUPPORT 1 19.16 19.17 #if (BX_PCI_USB_SUPPORT && !BX_PCI_SUPPORT) 19.18 #error To enable USB, you must also enable PCI
20.1 --- a/tools/ioemu/include/pc_system.h Mon Feb 07 08:19:24 2005 +0000 20.2 +++ b/tools/ioemu/include/pc_system.h Tue Feb 08 16:44:16 2005 +0000 20.3 @@ -45,6 +45,13 @@ BOCHSAPI extern class bx_pc_system_c bx_ 20.4 extern double m_ips; 20.5 #endif 20.6 20.7 +#ifdef BX_USE_VMX 20.8 +extern unsigned int tsc_per_bx_tick; 20.9 + 20.10 +#define rdtscll(val) \ 20.11 + __asm__ __volatile__("rdtsc" : "=A" (val)) 20.12 +#endif 20.13 + 20.14 class BOCHSAPI bx_pc_system_c : private logfunctions { 20.15 private: 20.16 20.17 @@ -87,6 +94,26 @@ private: 20.18 double m_ips; // Millions of Instructions Per Second 20.19 #endif 20.20 20.21 +#ifdef BX_USE_VMX 20.22 + static Bit64s get_clock(void) { 20.23 + struct timeval tv; 20.24 + gettimeofday(&tv, NULL); 20.25 + return tv.tv_sec * 1000000LL + tv.tv_usec; 20.26 + } 20.27 + 20.28 + static Bit64u cpu_calibrate_ticks(void) { 20.29 + Bit64s usec, t1, t2; 20.30 + 20.31 + usec = get_clock(); 20.32 + rdtscll(t1); 20.33 + 20.34 + usleep(50 * 1000); 20.35 + usec = get_clock() - usec; 20.36 + rdtscll(t2); 20.37 + 20.38 + return (((t2 - t1) * 1000000LL + (usec >> 1)) / usec); 20.39 + } 20.40 +#endif 20.41 // This handler is called when the function which decrements the clock 20.42 // ticks finds that an event has occurred. 20.43 void countdownEvent(void);
21.1 --- a/tools/ioemu/iodev/cpu.cc Mon Feb 07 08:19:24 2005 +0000 21.2 +++ b/tools/ioemu/iodev/cpu.cc Tue Feb 08 16:44:16 2005 +0000 21.3 @@ -180,7 +180,8 @@ bx_cpu_c::cpu_loop(int max_instr_count) 21.4 FD_ZERO(&rfds); 21.5 21.6 while (1) { 21.7 - unsigned long t1, t2; 21.8 + static unsigned long long t1 = 0; 21.9 + unsigned long long t2; 21.10 21.11 /* Wait up to one seconds. */ 21.12 tv.tv_sec = 0; 21.13 @@ -188,18 +189,30 @@ bx_cpu_c::cpu_loop(int max_instr_count) 21.14 FD_SET(evtchn_fd, &rfds); 21.15 21.16 send_event = 0; 21.17 - rdtscl(t1); 21.18 + 21.19 + if (t1 == 0) // the first time 21.20 + rdtscll(t1); 21.21 + 21.22 retval = select(evtchn_fd+1, &rfds, NULL, NULL, &tv); 21.23 - rdtscl(t2); 21.24 if (retval == -1) { 21.25 perror("select"); 21.26 return; 21.27 } 21.28 - //stime_usec = 1000000 * (1 - tv.tv_sec) - tv.tv_usec; 21.29 - if (t2 > t1) 21.30 - BX_TICKN((t2 - t1) / 2000); // should match ips in bochsrc 21.31 + 21.32 + rdtscll(t2); 21.33 + 21.34 +#if __WORDSIZE == 32 21.35 +#define ULONGLONG_MAX 0xffffffffffffffffULL 21.36 +#else 21.37 +#define ULONGLONG_MAX ULONG_MAX 21.38 +#endif 21.39 + 21.40 + if (t2 <= t1) 21.41 + BX_TICKN((t2 + ULONGLONG_MAX - t1) / tsc_per_bx_tick); 21.42 else 21.43 - BX_TICKN((MAXINT - t1 + t2) / 2000); // should match ips in bochsrc 21.44 + BX_TICKN((t2 - t1) / tsc_per_bx_tick); 21.45 + t1 = t2; 21.46 + 21.47 timer_handler(); 21.48 if (BX_CPU_INTR) { 21.49 #if BX_SUPPORT_APIC 21.50 @@ -248,7 +261,7 @@ bx_cpu_c::interrupt(Bit8u vector) 21.51 // page. 21.52 21.53 rdtscl(tscl); 21.54 - BX_INFO(("%lx: injecting vector: %x\n", tscl, vector)); 21.55 + BX_DEBUG(("%lx: injecting vector: %x\n", tscl, vector)); 21.56 intr = &(((vcpu_iodata_t *) shared_page)->vp_intr[0]); 21.57 set_bit(vector, intr); 21.58
22.1 --- a/tools/ioemu/iodev/pc_system.cc Mon Feb 07 08:19:24 2005 +0000 22.2 +++ b/tools/ioemu/iodev/pc_system.cc Tue Feb 08 16:44:16 2005 +0000 22.3 @@ -44,6 +44,10 @@ unsigned long ips_count=0; 22.4 double m_ips; // Millions of Instructions Per Second 22.5 #endif 22.6 22.7 +#ifdef BX_USE_VMX 22.8 +unsigned int tsc_per_bx_tick; 22.9 +#endif 22.10 + 22.11 // Option for turning off BX_TIMER_DEBUG? 22.12 // Check out m_ips and ips 22.13 22.14 @@ -98,6 +102,16 @@ bx_pc_system_c::init_ips(Bit32u ips) 22.15 a20_mask = 0xffffffff; 22.16 #endif 22.17 22.18 +#ifdef BX_USE_VMX 22.19 + Bit64u phy_cpu_freq = cpu_calibrate_ticks(); 22.20 + 22.21 + if (ips == 500000) { //default ips: we use fixed scaling factor to calulate ips 22.22 + tsc_per_bx_tick = 2000; 22.23 + ips = phy_cpu_freq / tsc_per_bx_tick; 22.24 + } else //use uesr defined ips to calulate factor 22.25 + tsc_per_bx_tick = ((phy_cpu_freq + (ips>>1)) / ips); 22.26 +#endif 22.27 + 22.28 // parameter 'ips' is the processor speed in Instructions-Per-Second 22.29 m_ips = double(ips) / 1000000.0L; 22.30
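To make the new time scaling concrete, a worked example with an assumed 2 GHz host (cpu_calibrate_ticks returns roughly TSC ticks per second, measured over a 50 ms sleep; the numbers are illustrative only):

    /* phy_cpu_freq ~= 2,000,000,000 (assumed host)                        */
    /* default ips == 500000 : tsc_per_bx_tick = 2000                      */
    /*                         ips = 2,000,000,000 / 2000 = 1,000,000      */
    /* user ips == 4,000,000 : tsc_per_bx_tick = (2e9 + 2e6) / 4e6 = 500   */
    /*                         (integer division)                          */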
23.1 --- a/tools/libxc/xc_linux_build.c Mon Feb 07 08:19:24 2005 +0000 23.2 +++ b/tools/libxc/xc_linux_build.c Tue Feb 08 16:44:16 2005 +0000 23.3 @@ -558,10 +558,10 @@ static int parseelfimage(char *elfbase, 23.4 phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); 23.5 if ( !is_loadable_phdr(phdr) ) 23.6 continue; 23.7 - if ( phdr->p_vaddr < kernstart ) 23.8 - kernstart = phdr->p_vaddr; 23.9 - if ( (phdr->p_vaddr + phdr->p_memsz) > kernend ) 23.10 - kernend = phdr->p_vaddr + phdr->p_memsz; 23.11 + if ( phdr->p_paddr < kernstart ) 23.12 + kernstart = phdr->p_paddr; 23.13 + if ( (phdr->p_paddr + phdr->p_memsz) > kernend ) 23.14 + kernend = phdr->p_paddr + phdr->p_memsz; 23.15 } 23.16 23.17 if ( (kernstart > kernend) || 23.18 @@ -611,7 +611,7 @@ loadelfimage( 23.19 23.20 for ( done = 0; done < phdr->p_filesz; done += chunksz ) 23.21 { 23.22 - pa = (phdr->p_vaddr + done) - vstart; 23.23 + pa = (phdr->p_paddr + done) - vstart; 23.24 va = xc_map_foreign_range( 23.25 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]); 23.26 chunksz = phdr->p_filesz - done; 23.27 @@ -624,7 +624,7 @@ loadelfimage( 23.28 23.29 for ( ; done < phdr->p_memsz; done += chunksz ) 23.30 { 23.31 - pa = (phdr->p_vaddr + done) - vstart; 23.32 + pa = (phdr->p_paddr + done) - vstart; 23.33 va = xc_map_foreign_range( 23.34 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]); 23.35 chunksz = phdr->p_memsz - done;
24.1 --- a/tools/libxc/xc_vmx_build.c Mon Feb 07 08:19:24 2005 +0000 24.2 +++ b/tools/libxc/xc_vmx_build.c Tue Feb 08 16:44:16 2005 +0000 24.3 @@ -629,10 +629,10 @@ static int parseelfimage(char *elfbase, 24.4 phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); 24.5 if ( !is_loadable_phdr(phdr) ) 24.6 continue; 24.7 - if ( phdr->p_vaddr < kernstart ) 24.8 - kernstart = phdr->p_vaddr; 24.9 - if ( (phdr->p_vaddr + phdr->p_memsz) > kernend ) 24.10 - kernend = phdr->p_vaddr + phdr->p_memsz; 24.11 + if ( phdr->p_paddr < kernstart ) 24.12 + kernstart = phdr->p_paddr; 24.13 + if ( (phdr->p_paddr + phdr->p_memsz) > kernend ) 24.14 + kernend = phdr->p_paddr + phdr->p_memsz; 24.15 } 24.16 24.17 if ( (kernstart > kernend) || 24.18 @@ -676,7 +676,7 @@ loadelfimage( 24.19 24.20 for ( done = 0; done < phdr->p_filesz; done += chunksz ) 24.21 { 24.22 - pa = (phdr->p_vaddr + done) - vstart - LINUX_PAGE_OFFSET; 24.23 + pa = (phdr->p_paddr + done) - vstart - LINUX_PAGE_OFFSET; 24.24 va = xc_map_foreign_range( 24.25 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]); 24.26 chunksz = phdr->p_filesz - done; 24.27 @@ -689,7 +689,7 @@ loadelfimage( 24.28 24.29 for ( ; done < phdr->p_memsz; done += chunksz ) 24.30 { 24.31 - pa = (phdr->p_vaddr + done) - vstart - LINUX_PAGE_OFFSET; 24.32 + pa = (phdr->p_paddr + done) - vstart - LINUX_PAGE_OFFSET; 24.33 va = xc_map_foreign_range( 24.34 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]); 24.35 chunksz = phdr->p_memsz - done;
25.1 --- a/tools/python/xen/xend/XendDomainInfo.py Mon Feb 07 08:19:24 2005 +0000 25.2 +++ b/tools/python/xen/xend/XendDomainInfo.py Tue Feb 08 16:44:16 2005 +0000 25.3 @@ -1337,6 +1337,7 @@ add_config_handler('memory', vm_fiel 25.4 add_config_handler('cpu', vm_field_ignore) 25.5 add_config_handler('cpu_weight', vm_field_ignore) 25.6 add_config_handler('console', vm_field_ignore) 25.7 +add_config_handler('restart', vm_field_ignore) 25.8 add_config_handler('image', vm_field_ignore) 25.9 add_config_handler('device', vm_field_ignore) 25.10 add_config_handler('backend', vm_field_ignore)
26.1 --- a/tools/python/xen/xend/server/SrvDaemon.py Mon Feb 07 08:19:24 2005 +0000 26.2 +++ b/tools/python/xen/xend/server/SrvDaemon.py Tue Feb 08 16:44:16 2005 +0000 26.3 @@ -486,10 +486,12 @@ class Daemon: 26.4 # XXX KAF: Why doesn't this capture output from C extensions that 26.5 # fprintf(stdout) or fprintf(stderr) ?? 26.6 os.open('/var/log/xend-debug.log', os.O_WRONLY|os.O_CREAT) 26.7 + os.dup(1) 26.8 else: 26.9 os.open('/dev/null', os.O_RDWR) 26.10 os.dup(0) 26.11 - os.dup(1) 26.12 + os.open('/var/log/xend-debug.log', os.O_WRONLY|os.O_CREAT) 26.13 + 26.14 26.15 def start(self, trace=0): 26.16 """Attempts to start the daemons.
27.1 --- a/xen/arch/x86/boot/mkelf32.c Mon Feb 07 08:19:24 2005 +0000 27.2 +++ b/xen/arch/x86/boot/mkelf32.c Tue Feb 08 16:44:16 2005 +0000 27.3 @@ -245,6 +245,12 @@ int main(int argc, char **argv) 27.4 return 1; 27.5 } 27.6 27.7 + /* 27.8 + * End the image on a page boundary. This gets round alignment bugs 27.9 + * in the boot- or chain-loader (e.g., kexec on the XenoBoot CD). 27.10 + */ 27.11 + mem_siz += -(loadbase + mem_siz) & 0xfff; 27.12 + 27.13 out_ehdr.e_entry = loadbase; 27.14 out_ehdr.e_shoff = RAW_OFFSET + dat_siz; 27.15
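The added expression rounds the end of the loaded image up to the next 4 KiB boundary. A worked example with assumed values:

    /* loadbase = 0x100000, mem_siz = 0x12345  (illustrative)              */
    /* loadbase + mem_siz            = 0x112345                            */
    /* -(loadbase + mem_siz) & 0xfff = 0x00cbb  (padding to the next page) */
    /* mem_siz += 0xcbb              => 0x13000                            */
    /* image now ends at 0x100000 + 0x13000 = 0x113000, page-aligned       */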
28.1 --- a/xen/arch/x86/boot/x86_32.S Mon Feb 07 08:19:24 2005 +0000 28.2 +++ b/xen/arch/x86/boot/x86_32.S Tue Feb 08 16:44:16 2005 +0000 28.3 @@ -214,7 +214,7 @@ ENTRY(gdt_table) 28.4 .org 0x1000 28.5 ENTRY(idle_pg_table) # Initial page directory is 4kB 28.6 .org 0x2000 28.7 -ENTRY(cpu0_stack) # Initial stack is 8kB 28.8 - .org 0x4000 28.9 +ENTRY(cpu0_stack) 28.10 + .org 0x2000 + STACK_SIZE 28.11 ENTRY(stext) 28.12 ENTRY(_stext)
29.1 --- a/xen/arch/x86/boot/x86_64.S Mon Feb 07 08:19:24 2005 +0000 29.2 +++ b/xen/arch/x86/boot/x86_64.S Tue Feb 08 16:44:16 2005 +0000 29.3 @@ -193,8 +193,8 @@ ENTRY(gdt_table) 29.4 .quad 0x00af9a000000ffff /* 0x0810 ring 0 code, 64-bit mode */ 29.5 .quad 0x00cf92000000ffff /* 0x0818 ring 0 data */ 29.6 .quad 0x00cffa000000ffff /* 0x0823 ring 3 code, compatibility */ 29.7 - .quad 0x00affa000000ffff /* 0x082b ring 3 code, 64-bit mode */ 29.8 - .quad 0x00cff2000000ffff /* 0x0833 ring 3 data */ 29.9 + .quad 0x00cff2000000ffff /* 0x082b ring 3 data */ 29.10 + .quad 0x00affa000000ffff /* 0x0833 ring 3 code, 64-bit mode */ 29.11 .quad 0x0000000000000000 /* unused */ 29.12 .fill 4*NR_CPUS,8,0 /* space for TSS and LDT per CPU */ 29.13 29.14 @@ -243,8 +243,8 @@ ENTRY(idle_pg_table_l2) 29.15 identmap /* Too orangey for crows :-) */ 29.16 29.17 .org 0x4000 29.18 -ENTRY(cpu0_stack) # Initial stack is 8kB 29.19 +ENTRY(cpu0_stack) 29.20 29.21 - .org 0x6000 29.22 + .org 0x4000 + STACK_SIZE 29.23 ENTRY(stext) 29.24 ENTRY(_stext)
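The two ring-3 descriptors swap places so that, counting from the compatibility-mode code selector at 0x0823, the layout becomes data at +8 and 64-bit code at +16. Reading this as the AMD64 SYSRET convention (SS loaded from MSR_STAR[63:48] + 8, 64-bit CS from + 16) is an inference from the hunk, not from the changeset description:

    /* selector 0x0823: ring 3 code, compatibility  (SYSRET selector base) */
    /* selector 0x082b: ring 3 data                 (base + 8  -> SS)      */
    /* selector 0x0833: ring 3 code, 64-bit mode    (base + 16 -> CS)      */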
30.1 --- a/xen/arch/x86/dom0_ops.c Mon Feb 07 08:19:24 2005 +0000 30.2 +++ b/xen/arch/x86/dom0_ops.c Tue Feb 08 16:44:16 2005 +0000 30.3 @@ -376,7 +376,7 @@ void arch_getdomaininfo_ctxt( 30.4 { 30.5 for ( i = 0; i < 16; i++ ) 30.6 c->gdt_frames[i] = 30.7 - l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i]); 30.8 + l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i]); 30.9 c->gdt_ents = GET_GDT_ENTRIES(ed); 30.10 } 30.11 c->guestos_ss = ed->arch.guestos_ss;
31.1 --- a/xen/arch/x86/domain.c Mon Feb 07 08:19:24 2005 +0000 31.2 +++ b/xen/arch/x86/domain.c Tue Feb 08 16:44:16 2005 +0000 31.3 @@ -304,7 +304,7 @@ void arch_vmx_do_launch(struct exec_doma 31.4 static void monitor_mk_pagetable(struct exec_domain *ed) 31.5 { 31.6 unsigned long mpfn; 31.7 - l2_pgentry_t *mpl2e; 31.8 + l2_pgentry_t *mpl2e, *phys_table; 31.9 struct pfn_info *mpfn_info; 31.10 struct domain *d = ed->domain; 31.11 31.12 @@ -312,20 +312,26 @@ static void monitor_mk_pagetable(struct 31.13 ASSERT( mpfn_info ); 31.14 31.15 mpfn = (unsigned long) (mpfn_info - frame_table); 31.16 - mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << L1_PAGETABLE_SHIFT); 31.17 + mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT); 31.18 memset(mpl2e, 0, PAGE_SIZE); 31.19 31.20 memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 31.21 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 31.22 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 31.23 31.24 - ed->arch.monitor_table = mk_pagetable(mpfn << L1_PAGETABLE_SHIFT); 31.25 + ed->arch.monitor_table = mk_pagetable(mpfn << PAGE_SHIFT); 31.26 d->arch.shadow_mode = SHM_full_32; 31.27 31.28 mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 31.29 mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 31.30 | __PAGE_HYPERVISOR); 31.31 31.32 + phys_table = (l2_pgentry_t *) map_domain_mem(pagetable_val( 31.33 + ed->arch.phys_table)); 31.34 + memcpy(d->arch.mm_perdomain_pt, phys_table, 31.35 + ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); 31.36 + 31.37 + unmap_domain_mem(phys_table); 31.38 unmap_domain_mem(mpl2e); 31.39 } 31.40 31.41 @@ -466,6 +472,7 @@ int arch_final_setup_guestos( 31.42 31.43 phys_basetab = c->pt_base; 31.44 d->arch.pagetable = mk_pagetable(phys_basetab); 31.45 + d->arch.phys_table = d->arch.pagetable; 31.46 if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain, 31.47 PGT_base_page_table) ) 31.48 return -EINVAL; 31.49 @@ -628,12 +635,11 @@ long do_iopl(domid_t domain, unsigned in 31.50 return 0; 31.51 } 31.52 31.53 -unsigned long hypercall_create_continuation( 31.54 +unsigned long __hypercall_create_continuation( 31.55 unsigned int op, unsigned int nr_args, ...) 
31.56 { 31.57 struct mc_state *mcs = &mc_state[smp_processor_id()]; 31.58 execution_context_t *ec; 31.59 - unsigned long *preg; 31.60 unsigned int i; 31.61 va_list args; 31.62 31.63 @@ -653,10 +659,34 @@ unsigned long hypercall_create_continuat 31.64 ec->eax = op; 31.65 ec->eip -= 2; /* re-execute 'int 0x82' */ 31.66 31.67 - for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ ) 31.68 - *preg = va_arg(args, unsigned long); 31.69 -#else 31.70 - preg = NULL; /* XXX x86/64 */ 31.71 + for ( i = 0; i < nr_args; i++ ) 31.72 + { 31.73 + switch ( i ) 31.74 + { 31.75 + case 0: ec->ebx = va_arg(args, unsigned long); break; 31.76 + case 1: ec->ecx = va_arg(args, unsigned long); break; 31.77 + case 2: ec->edx = va_arg(args, unsigned long); break; 31.78 + case 3: ec->esi = va_arg(args, unsigned long); break; 31.79 + case 4: ec->edi = va_arg(args, unsigned long); break; 31.80 + case 5: ec->ebp = va_arg(args, unsigned long); break; 31.81 + } 31.82 + } 31.83 +#elif defined(__x86_64__) 31.84 + ec->rax = op; 31.85 + ec->rip -= 2; /* re-execute 'syscall' */ 31.86 + 31.87 + for ( i = 0; i < nr_args; i++ ) 31.88 + { 31.89 + switch ( i ) 31.90 + { 31.91 + case 0: ec->rdi = va_arg(args, unsigned long); break; 31.92 + case 1: ec->rsi = va_arg(args, unsigned long); break; 31.93 + case 2: ec->rdx = va_arg(args, unsigned long); break; 31.94 + case 3: ec->r10 = va_arg(args, unsigned long); break; 31.95 + case 4: ec->r8 = va_arg(args, unsigned long); break; 31.96 + case 5: ec->r9 = va_arg(args, unsigned long); break; 31.97 + } 31.98 + } 31.99 #endif 31.100 } 31.101 31.102 @@ -726,8 +756,6 @@ static void relinquish_list(struct domai 31.103 #ifdef CONFIG_VMX 31.104 static void vmx_domain_relinquish_memory(struct exec_domain *ed) 31.105 { 31.106 - struct domain *d = ed->domain; 31.107 - 31.108 /* 31.109 * Free VMCS 31.110 */ 31.111 @@ -736,22 +764,6 @@ static void vmx_domain_relinquish_memory 31.112 ed->arch.arch_vmx.vmcs = 0; 31.113 31.114 monitor_rm_pagetable(ed); 31.115 - 31.116 - if (ed == d->exec_domain[0]) { 31.117 - int i; 31.118 - unsigned long pfn; 31.119 - 31.120 - for (i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++) { 31.121 - unsigned long l1e; 31.122 - 31.123 - l1e = l1_pgentry_val(d->arch.mm_perdomain_pt[i]); 31.124 - if (l1e & _PAGE_PRESENT) { 31.125 - pfn = l1e >> PAGE_SHIFT; 31.126 - free_domheap_page(&frame_table[pfn]); 31.127 - } 31.128 - } 31.129 - } 31.130 - 31.131 } 31.132 #endif 31.133
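For reference, the argument registers targeted by the rewritten marshalling code, and hence the registers a continued hypercall is re-issued with (taken directly from the switch statements above):

    /* arg index :  0     1     2     3     4     5                        */
    /* i386      :  ebx   ecx   edx   esi   edi   ebp                      */
    /* x86_64    :  rdi   rsi   rdx   r10   r8    r9                       */
    /* In both cases the saved instruction pointer is wound back two bytes */
    /* so the guest re-executes the hypercall ('int 0x82' / 'syscall').    */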
32.1 --- a/xen/arch/x86/memory.c Mon Feb 07 08:19:24 2005 +0000 32.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 32.3 @@ -1,2401 +0,0 @@ 32.4 -/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 32.5 -/****************************************************************************** 32.6 - * arch/x86/memory.c 32.7 - * 32.8 - * Copyright (c) 2002-2004 K A Fraser 32.9 - * Copyright (c) 2004 Christian Limpach 32.10 - * 32.11 - * This program is free software; you can redistribute it and/or modify 32.12 - * it under the terms of the GNU General Public License as published by 32.13 - * the Free Software Foundation; either version 2 of the License, or 32.14 - * (at your option) any later version. 32.15 - * 32.16 - * This program is distributed in the hope that it will be useful, 32.17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 32.18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 32.19 - * GNU General Public License for more details. 32.20 - * 32.21 - * You should have received a copy of the GNU General Public License 32.22 - * along with this program; if not, write to the Free Software 32.23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 32.24 - */ 32.25 - 32.26 -/* 32.27 - * A description of the x86 page table API: 32.28 - * 32.29 - * Domains trap to do_mmu_update with a list of update requests. 32.30 - * This is a list of (ptr, val) pairs, where the requested operation 32.31 - * is *ptr = val. 32.32 - * 32.33 - * Reference counting of pages: 32.34 - * ---------------------------- 32.35 - * Each page has two refcounts: tot_count and type_count. 32.36 - * 32.37 - * TOT_COUNT is the obvious reference count. It counts all uses of a 32.38 - * physical page frame by a domain, including uses as a page directory, 32.39 - * a page table, or simple mappings via a PTE. This count prevents a 32.40 - * domain from releasing a frame back to the free pool when it still holds 32.41 - * a reference to it. 32.42 - * 32.43 - * TYPE_COUNT is more subtle. A frame can be put to one of three 32.44 - * mutually-exclusive uses: it might be used as a page directory, or a 32.45 - * page table, or it may be mapped writable by the domain [of course, a 32.46 - * frame may not be used in any of these three ways!]. 32.47 - * So, type_count is a count of the number of times a frame is being 32.48 - * referred to in its current incarnation. Therefore, a page can only 32.49 - * change its type when its type count is zero. 32.50 - * 32.51 - * Pinning the page type: 32.52 - * ---------------------- 32.53 - * The type of a page can be pinned/unpinned with the commands 32.54 - * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, 32.55 - * pinning is not reference counted, so it can't be nested). 32.56 - * This is useful to prevent a page's type count falling to zero, at which 32.57 - * point safety checks would need to be carried out next time the count 32.58 - * is increased again. 32.59 - * 32.60 - * A further note on writable page mappings: 32.61 - * ----------------------------------------- 32.62 - * For simplicity, the count of writable mappings for a page may not 32.63 - * correspond to reality. The 'writable count' is incremented for every 32.64 - * PTE which maps the page with the _PAGE_RW flag set. However, for 32.65 - * write access to be possible the page directory entry must also have 32.66 - * its _PAGE_RW bit set. 
We do not check this as it complicates the 32.67 - * reference counting considerably [consider the case of multiple 32.68 - * directory entries referencing a single page table, some with the RW 32.69 - * bit set, others not -- it starts getting a bit messy]. 32.70 - * In normal use, this simplification shouldn't be a problem. 32.71 - * However, the logic can be added if required. 32.72 - * 32.73 - * One more note on read-only page mappings: 32.74 - * ----------------------------------------- 32.75 - * We want domains to be able to map pages for read-only access. The 32.76 - * main reason is that page tables and directories should be readable 32.77 - * by a domain, but it would not be safe for them to be writable. 32.78 - * However, domains have free access to rings 1 & 2 of the Intel 32.79 - * privilege model. In terms of page protection, these are considered 32.80 - * to be part of 'supervisor mode'. The WP bit in CR0 controls whether 32.81 - * read-only restrictions are respected in supervisor mode -- if the 32.82 - * bit is clear then any mapped page is writable. 32.83 - * 32.84 - * We get round this by always setting the WP bit and disallowing 32.85 - * updates to it. This is very unlikely to cause a problem for guest 32.86 - * OS's, which will generally use the WP bit to simplify copy-on-write 32.87 - * implementation (in that case, OS wants a fault when it writes to 32.88 - * an application-supplied buffer). 32.89 - */ 32.90 - 32.91 -#include <xen/config.h> 32.92 -#include <xen/init.h> 32.93 -#include <xen/kernel.h> 32.94 -#include <xen/lib.h> 32.95 -#include <xen/mm.h> 32.96 -#include <xen/sched.h> 32.97 -#include <xen/errno.h> 32.98 -#include <xen/perfc.h> 32.99 -#include <xen/irq.h> 32.100 -#include <xen/softirq.h> 32.101 -#include <asm/shadow.h> 32.102 -#include <asm/page.h> 32.103 -#include <asm/flushtlb.h> 32.104 -#include <asm/io.h> 32.105 -#include <asm/uaccess.h> 32.106 -#include <asm/domain_page.h> 32.107 -#include <asm/ldt.h> 32.108 - 32.109 -#ifdef VERBOSE 32.110 -#define MEM_LOG(_f, _a...) \ 32.111 - printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ 32.112 - current->domain->id , __LINE__ , ## _a ) 32.113 -#else 32.114 -#define MEM_LOG(_f, _a...) ((void)0) 32.115 -#endif 32.116 - 32.117 -static int alloc_l2_table(struct pfn_info *page); 32.118 -static int alloc_l1_table(struct pfn_info *page); 32.119 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); 32.120 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 32.121 - u32 type, 32.122 - struct domain *d); 32.123 - 32.124 -static void free_l2_table(struct pfn_info *page); 32.125 -static void free_l1_table(struct pfn_info *page); 32.126 - 32.127 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); 32.128 -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); 32.129 - 32.130 -/* Used to defer flushing of memory structures. */ 32.131 -static struct { 32.132 -#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ 32.133 -#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ 32.134 - unsigned long deferred_ops; 32.135 - /* If non-NULL, specifies a foreign subject domain for some operations. */ 32.136 - struct domain *foreign; 32.137 -} __cacheline_aligned percpu_info[NR_CPUS]; 32.138 - 32.139 -/* 32.140 - * Returns the current foreign domain; defaults to the currently-executing 32.141 - * domain if a foreign override hasn't been specified. 32.142 - */ 32.143 -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? 
: current->domain) 32.144 - 32.145 -/* Private domain structs for DOMID_XEN and DOMID_IO. */ 32.146 -static struct domain *dom_xen, *dom_io; 32.147 - 32.148 -/* Frame table and its size in pages. */ 32.149 -struct pfn_info *frame_table; 32.150 -unsigned long frame_table_size; 32.151 -unsigned long max_page; 32.152 - 32.153 -void __init init_frametable(void) 32.154 -{ 32.155 - unsigned long i, p; 32.156 - 32.157 - frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; 32.158 - frame_table_size = max_page * sizeof(struct pfn_info); 32.159 - frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; 32.160 - 32.161 - for ( i = 0; i < frame_table_size; i += (4UL << 20) ) 32.162 - { 32.163 - p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); 32.164 - if ( p == 0 ) 32.165 - panic("Not enough memory for frame table\n"); 32.166 - map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 32.167 - 4UL << 20, PAGE_HYPERVISOR); 32.168 - } 32.169 - 32.170 - memset(frame_table, 0, frame_table_size); 32.171 -} 32.172 - 32.173 -void arch_init_memory(void) 32.174 -{ 32.175 - extern void subarch_init_memory(struct domain *); 32.176 - 32.177 - memset(percpu_info, 0, sizeof(percpu_info)); 32.178 - 32.179 - /* 32.180 - * Initialise our DOMID_XEN domain. 32.181 - * Any Xen-heap pages that we will allow to be mapped will have 32.182 - * their domain field set to dom_xen. 32.183 - */ 32.184 - dom_xen = alloc_domain_struct(); 32.185 - atomic_set(&dom_xen->refcnt, 1); 32.186 - dom_xen->id = DOMID_XEN; 32.187 - 32.188 - /* 32.189 - * Initialise our DOMID_IO domain. 32.190 - * This domain owns no pages but is considered a special case when 32.191 - * mapping I/O pages, as the mappings occur at the priv of the caller. 32.192 - */ 32.193 - dom_io = alloc_domain_struct(); 32.194 - atomic_set(&dom_io->refcnt, 1); 32.195 - dom_io->id = DOMID_IO; 32.196 - 32.197 - subarch_init_memory(dom_xen); 32.198 -} 32.199 - 32.200 -void write_ptbase(struct exec_domain *ed) 32.201 -{ 32.202 - struct domain *d = ed->domain; 32.203 - unsigned long pa; 32.204 - 32.205 -#ifdef CONFIG_VMX 32.206 - if ( unlikely(d->arch.shadow_mode) ) 32.207 - pa = ((d->arch.shadow_mode == SHM_full_32) ? 32.208 - pagetable_val(ed->arch.monitor_table) : 32.209 - pagetable_val(ed->arch.shadow_table)); 32.210 - else 32.211 - pa = pagetable_val(ed->arch.pagetable); 32.212 -#else 32.213 - if ( unlikely(d->arch.shadow_mode) ) 32.214 - pa = pagetable_val(ed->arch.shadow_table); 32.215 - else 32.216 - pa = pagetable_val(ed->arch.pagetable); 32.217 -#endif 32.218 - 32.219 - write_cr3(pa); 32.220 -} 32.221 - 32.222 -static void __invalidate_shadow_ldt(struct exec_domain *d) 32.223 -{ 32.224 - int i; 32.225 - unsigned long pfn; 32.226 - struct pfn_info *page; 32.227 - 32.228 - d->arch.shadow_ldt_mapcnt = 0; 32.229 - 32.230 - for ( i = 16; i < 32; i++ ) 32.231 - { 32.232 - pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]); 32.233 - if ( pfn == 0 ) continue; 32.234 - d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 32.235 - page = &frame_table[pfn]; 32.236 - ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); 32.237 - ASSERT_PAGE_IS_DOMAIN(page, d->domain); 32.238 - put_page_and_type(page); 32.239 - } 32.240 - 32.241 - /* Dispose of the (now possibly invalid) mappings from the TLB. 
*/ 32.242 - percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; 32.243 -} 32.244 - 32.245 - 32.246 -static inline void invalidate_shadow_ldt(struct exec_domain *d) 32.247 -{ 32.248 - if ( d->arch.shadow_ldt_mapcnt != 0 ) 32.249 - __invalidate_shadow_ldt(d); 32.250 -} 32.251 - 32.252 - 32.253 -static int alloc_segdesc_page(struct pfn_info *page) 32.254 -{ 32.255 - unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); 32.256 - int i; 32.257 - 32.258 - for ( i = 0; i < 512; i++ ) 32.259 - if ( unlikely(!check_descriptor(&descs[i*2])) ) 32.260 - goto fail; 32.261 - 32.262 - unmap_domain_mem(descs); 32.263 - return 1; 32.264 - 32.265 - fail: 32.266 - unmap_domain_mem(descs); 32.267 - return 0; 32.268 -} 32.269 - 32.270 - 32.271 -/* Map shadow page at offset @off. */ 32.272 -int map_ldt_shadow_page(unsigned int off) 32.273 -{ 32.274 - struct exec_domain *ed = current; 32.275 - struct domain *d = ed->domain; 32.276 - unsigned long l1e; 32.277 - 32.278 - if ( unlikely(in_irq()) ) 32.279 - BUG(); 32.280 - 32.281 - __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> 32.282 - PAGE_SHIFT) + off]); 32.283 - 32.284 - if ( unlikely(!(l1e & _PAGE_PRESENT)) || 32.285 - unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 32.286 - d, PGT_ldt_page)) ) 32.287 - return 0; 32.288 - 32.289 - ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); 32.290 - ed->arch.shadow_ldt_mapcnt++; 32.291 - 32.292 - return 1; 32.293 -} 32.294 - 32.295 - 32.296 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) 32.297 -{ 32.298 - struct pfn_info *page = &frame_table[page_nr]; 32.299 - 32.300 - if ( unlikely(!pfn_is_ram(page_nr)) ) 32.301 - { 32.302 - MEM_LOG("Pfn %08lx is not RAM", page_nr); 32.303 - return 0; 32.304 - } 32.305 - 32.306 - if ( unlikely(!get_page(page, d)) ) 32.307 - { 32.308 - MEM_LOG("Could not get page ref for pfn %08lx", page_nr); 32.309 - return 0; 32.310 - } 32.311 - 32.312 - return 1; 32.313 -} 32.314 - 32.315 - 32.316 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 32.317 - u32 type, 32.318 - struct domain *d) 32.319 -{ 32.320 - struct pfn_info *page = &frame_table[page_nr]; 32.321 - 32.322 - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) 32.323 - return 0; 32.324 - 32.325 - if ( unlikely(!get_page_type(page, type)) ) 32.326 - { 32.327 -#ifdef VERBOSE 32.328 - if ( (type & PGT_type_mask) != PGT_l1_page_table ) 32.329 - MEM_LOG("Bad page type for pfn %08lx (%08x)", 32.330 - page_nr, page->u.inuse.type_info); 32.331 -#endif 32.332 - put_page(page); 32.333 - return 0; 32.334 - } 32.335 - 32.336 - return 1; 32.337 -} 32.338 - 32.339 - 32.340 -/* 32.341 - * We allow an L2 tables to map each other (a.k.a. linear page tables). It 32.342 - * needs some special care with reference counst and access permissions: 32.343 - * 1. The mapping entry must be read-only, or the guest may get write access 32.344 - * to its own PTEs. 32.345 - * 2. We must only bump the reference counts for an *already validated* 32.346 - * L2 table, or we can end up in a deadlock in get_page_type() by waiting 32.347 - * on a validation that is required to complete that validation. 32.348 - * 3. We only need to increment the reference counts for the mapped page 32.349 - * frame if it is mapped by a different L2 table. This is sufficient and 32.350 - * also necessary to allow validation of an L2 table mapping itself. 
32.351 - */ 32.352 -static int 32.353 -get_linear_pagetable( 32.354 - l2_pgentry_t l2e, unsigned long pfn, struct domain *d) 32.355 -{ 32.356 - u32 x, y; 32.357 - struct pfn_info *page; 32.358 - 32.359 - if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) 32.360 - { 32.361 - MEM_LOG("Attempt to create linear p.t. with write perms"); 32.362 - return 0; 32.363 - } 32.364 - 32.365 - if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 32.366 - { 32.367 - /* Make sure the mapped frame belongs to the correct domain. */ 32.368 - if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) ) 32.369 - return 0; 32.370 - 32.371 - /* 32.372 - * Make sure that the mapped frame is an already-validated L2 table. 32.373 - * If so, atomically increment the count (checking for overflow). 32.374 - */ 32.375 - page = &frame_table[l2_pgentry_to_pagenr(l2e)]; 32.376 - y = page->u.inuse.type_info; 32.377 - do { 32.378 - x = y; 32.379 - if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || 32.380 - unlikely((x & (PGT_type_mask|PGT_validated)) != 32.381 - (PGT_l2_page_table|PGT_validated)) ) 32.382 - { 32.383 - put_page(page); 32.384 - return 0; 32.385 - } 32.386 - } 32.387 - while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); 32.388 - } 32.389 - 32.390 - return 1; 32.391 -} 32.392 - 32.393 - 32.394 -static int 32.395 -get_page_from_l1e( 32.396 - l1_pgentry_t l1e, struct domain *d) 32.397 -{ 32.398 - unsigned long l1v = l1_pgentry_val(l1e); 32.399 - unsigned long pfn = l1_pgentry_to_pagenr(l1e); 32.400 - struct pfn_info *page = &frame_table[pfn]; 32.401 - extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); 32.402 - 32.403 - if ( !(l1v & _PAGE_PRESENT) ) 32.404 - return 1; 32.405 - 32.406 - if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) 32.407 - { 32.408 - MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); 32.409 - return 0; 32.410 - } 32.411 - 32.412 - if ( unlikely(!pfn_is_ram(pfn)) ) 32.413 - { 32.414 - /* Revert to caller privileges if FD == DOMID_IO. */ 32.415 - if ( d == dom_io ) 32.416 - d = current->domain; 32.417 - 32.418 - if ( IS_PRIV(d) ) 32.419 - return 1; 32.420 - 32.421 - if ( IS_CAPABLE_PHYSDEV(d) ) 32.422 - return domain_iomem_in_pfn(d, pfn); 32.423 - 32.424 - MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); 32.425 - return 0; 32.426 - } 32.427 - 32.428 - return ((l1v & _PAGE_RW) ? 32.429 - get_page_and_type(page, d, PGT_writable_page) : 32.430 - get_page(page, d)); 32.431 -} 32.432 - 32.433 - 32.434 -/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ 32.435 -static int 32.436 -get_page_from_l2e( 32.437 - l2_pgentry_t l2e, unsigned long pfn, 32.438 - struct domain *d, unsigned long va_idx) 32.439 -{ 32.440 - int rc; 32.441 - 32.442 - if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 32.443 - return 1; 32.444 - 32.445 - if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 32.446 - { 32.447 - MEM_LOG("Bad L2 page type settings %04lx", 32.448 - l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); 32.449 - return 0; 32.450 - } 32.451 - 32.452 - rc = get_page_and_type_from_pagenr( 32.453 - l2_pgentry_to_pagenr(l2e), 32.454 - PGT_l1_page_table | (va_idx<<PGT_va_shift), d); 32.455 - 32.456 - if ( unlikely(!rc) ) 32.457 - return get_linear_pagetable(l2e, pfn, d); 32.458 - 32.459 - return 1; 32.460 -} 32.461 - 32.462 - 32.463 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) 32.464 -{ 32.465 - unsigned long l1v = l1_pgentry_val(l1e); 32.466 - unsigned long pfn = l1_pgentry_to_pagenr(l1e); 32.467 - struct pfn_info *page = &frame_table[pfn]; 32.468 - struct domain *e; 32.469 - 32.470 - if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) ) 32.471 - return; 32.472 - 32.473 - e = page_get_owner(page); 32.474 - if ( unlikely(e != d) ) 32.475 - { 32.476 - /* 32.477 - * Unmap a foreign page that may have been mapped via a grant table. 32.478 - * Note that this can fail for a privileged domain that can map foreign 32.479 - * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings 32.480 - * counted via a grant entry and some counted directly in the page 32.481 - * structure's reference count. Note that reference counts won't get 32.482 - * dangerously confused as long as we always try to decrement the 32.483 - * grant entry first. We may end up with a mismatch between which 32.484 - * mappings and which unmappings are counted via the grant entry, but 32.485 - * really it doesn't matter as privileged domains have carte blanche. 32.486 - */ 32.487 - if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) ) 32.488 - return; 32.489 - /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */ 32.490 - } 32.491 - 32.492 - if ( l1v & _PAGE_RW ) 32.493 - { 32.494 - put_page_and_type(page); 32.495 - } 32.496 - else 32.497 - { 32.498 - /* We expect this is rare so we blow the entire shadow LDT. */ 32.499 - if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 32.500 - PGT_ldt_page)) && 32.501 - unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) 32.502 - invalidate_shadow_ldt(e->exec_domain[0]); 32.503 - put_page(page); 32.504 - } 32.505 -} 32.506 - 32.507 - 32.508 -/* 32.509 - * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 32.510 - * Note also that this automatically deals correctly with linear p.t.'s. 
32.511 - */ 32.512 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 32.513 -{ 32.514 - if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 32.515 - ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) 32.516 - put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); 32.517 -} 32.518 - 32.519 - 32.520 -static int alloc_l2_table(struct pfn_info *page) 32.521 -{ 32.522 - struct domain *d = page_get_owner(page); 32.523 - unsigned long page_nr = page_to_pfn(page); 32.524 - l2_pgentry_t *pl2e; 32.525 - int i; 32.526 - 32.527 - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 32.528 - 32.529 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 32.530 - if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) 32.531 - goto fail; 32.532 - 32.533 -#if defined(__i386__) 32.534 - /* Now we add our private high mappings. */ 32.535 - memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 32.536 - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 32.537 - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 32.538 - pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = 32.539 - mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 32.540 - pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 32.541 - mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 32.542 - __PAGE_HYPERVISOR); 32.543 -#endif 32.544 - 32.545 - unmap_domain_mem(pl2e); 32.546 - return 1; 32.547 - 32.548 - fail: 32.549 - while ( i-- > 0 ) 32.550 - put_page_from_l2e(pl2e[i], page_nr); 32.551 - 32.552 - unmap_domain_mem(pl2e); 32.553 - return 0; 32.554 -} 32.555 - 32.556 - 32.557 -static int alloc_l1_table(struct pfn_info *page) 32.558 -{ 32.559 - struct domain *d = page_get_owner(page); 32.560 - unsigned long page_nr = page_to_pfn(page); 32.561 - l1_pgentry_t *pl1e; 32.562 - int i; 32.563 - 32.564 - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 32.565 - 32.566 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.567 - if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) 32.568 - goto fail; 32.569 - 32.570 - unmap_domain_mem(pl1e); 32.571 - return 1; 32.572 - 32.573 - fail: 32.574 - while ( i-- > 0 ) 32.575 - put_page_from_l1e(pl1e[i], d); 32.576 - 32.577 - unmap_domain_mem(pl1e); 32.578 - return 0; 32.579 -} 32.580 - 32.581 - 32.582 -static void free_l2_table(struct pfn_info *page) 32.583 -{ 32.584 - unsigned long page_nr = page - frame_table; 32.585 - l2_pgentry_t *pl2e; 32.586 - int i; 32.587 - 32.588 - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 32.589 - 32.590 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 32.591 - put_page_from_l2e(pl2e[i], page_nr); 32.592 - 32.593 - unmap_domain_mem(pl2e); 32.594 -} 32.595 - 32.596 - 32.597 -static void free_l1_table(struct pfn_info *page) 32.598 -{ 32.599 - struct domain *d = page_get_owner(page); 32.600 - unsigned long page_nr = page - frame_table; 32.601 - l1_pgentry_t *pl1e; 32.602 - int i; 32.603 - 32.604 - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 32.605 - 32.606 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.607 - put_page_from_l1e(pl1e[i], d); 32.608 - 32.609 - unmap_domain_mem(pl1e); 32.610 -} 32.611 - 32.612 - 32.613 -static inline int update_l2e(l2_pgentry_t *pl2e, 32.614 - l2_pgentry_t ol2e, 32.615 - l2_pgentry_t nl2e) 32.616 -{ 32.617 - unsigned long o = cmpxchg((unsigned long *)pl2e, 32.618 - l2_pgentry_val(ol2e), 32.619 - l2_pgentry_val(nl2e)); 32.620 - if ( o != l2_pgentry_val(ol2e) ) 32.621 - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 32.622 - l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); 32.623 - return (o == 
l2_pgentry_val(ol2e)); 32.624 -} 32.625 - 32.626 - 32.627 -/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ 32.628 -static int mod_l2_entry(l2_pgentry_t *pl2e, 32.629 - l2_pgentry_t nl2e, 32.630 - unsigned long pfn) 32.631 -{ 32.632 - l2_pgentry_t ol2e; 32.633 - unsigned long _ol2e; 32.634 - 32.635 - if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= 32.636 - DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 32.637 - { 32.638 - MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); 32.639 - return 0; 32.640 - } 32.641 - 32.642 - if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) 32.643 - return 0; 32.644 - ol2e = mk_l2_pgentry(_ol2e); 32.645 - 32.646 - if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) 32.647 - { 32.648 - /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 32.649 - if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) 32.650 - return update_l2e(pl2e, ol2e, nl2e); 32.651 - 32.652 - if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, 32.653 - ((unsigned long)pl2e & 32.654 - ~PAGE_MASK) >> 2)) ) 32.655 - return 0; 32.656 - 32.657 - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 32.658 - { 32.659 - put_page_from_l2e(nl2e, pfn); 32.660 - return 0; 32.661 - } 32.662 - 32.663 - put_page_from_l2e(ol2e, pfn); 32.664 - return 1; 32.665 - } 32.666 - 32.667 - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 32.668 - return 0; 32.669 - 32.670 - put_page_from_l2e(ol2e, pfn); 32.671 - return 1; 32.672 -} 32.673 - 32.674 - 32.675 -static inline int update_l1e(l1_pgentry_t *pl1e, 32.676 - l1_pgentry_t ol1e, 32.677 - l1_pgentry_t nl1e) 32.678 -{ 32.679 - unsigned long o = l1_pgentry_val(ol1e); 32.680 - unsigned long n = l1_pgentry_val(nl1e); 32.681 - 32.682 - if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || 32.683 - unlikely(o != l1_pgentry_val(ol1e)) ) 32.684 - { 32.685 - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 32.686 - l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); 32.687 - return 0; 32.688 - } 32.689 - 32.690 - return 1; 32.691 -} 32.692 - 32.693 - 32.694 -/* Update the L1 entry at pl1e to new value nl1e. */ 32.695 -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) 32.696 -{ 32.697 - l1_pgentry_t ol1e; 32.698 - unsigned long _ol1e; 32.699 - struct domain *d = current->domain; 32.700 - 32.701 - if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) 32.702 - { 32.703 - MEM_LOG("Bad get_user\n"); 32.704 - return 0; 32.705 - } 32.706 - 32.707 - ol1e = mk_l1_pgentry(_ol1e); 32.708 - 32.709 - if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) 32.710 - { 32.711 - /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? 
*/ 32.712 - if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) 32.713 - return update_l1e(pl1e, ol1e, nl1e); 32.714 - 32.715 - if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) 32.716 - return 0; 32.717 - 32.718 - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 32.719 - { 32.720 - put_page_from_l1e(nl1e, d); 32.721 - return 0; 32.722 - } 32.723 - 32.724 - put_page_from_l1e(ol1e, d); 32.725 - return 1; 32.726 - } 32.727 - 32.728 - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 32.729 - return 0; 32.730 - 32.731 - put_page_from_l1e(ol1e, d); 32.732 - return 1; 32.733 -} 32.734 - 32.735 - 32.736 -int alloc_page_type(struct pfn_info *page, unsigned int type) 32.737 -{ 32.738 - switch ( type ) 32.739 - { 32.740 - case PGT_l1_page_table: 32.741 - return alloc_l1_table(page); 32.742 - case PGT_l2_page_table: 32.743 - return alloc_l2_table(page); 32.744 - case PGT_gdt_page: 32.745 - case PGT_ldt_page: 32.746 - return alloc_segdesc_page(page); 32.747 - default: 32.748 - printk("Bad type in alloc_page_type %x t=%x c=%x\n", 32.749 - type, page->u.inuse.type_info, 32.750 - page->count_info); 32.751 - BUG(); 32.752 - } 32.753 - 32.754 - return 0; 32.755 -} 32.756 - 32.757 - 32.758 -void free_page_type(struct pfn_info *page, unsigned int type) 32.759 -{ 32.760 - struct domain *d = page_get_owner(page); 32.761 - 32.762 - switch ( type ) 32.763 - { 32.764 - case PGT_l1_page_table: 32.765 - free_l1_table(page); 32.766 - break; 32.767 - 32.768 - case PGT_l2_page_table: 32.769 - free_l2_table(page); 32.770 - break; 32.771 - 32.772 - default: 32.773 - BUG(); 32.774 - } 32.775 - 32.776 - if ( unlikely(d->arch.shadow_mode) && 32.777 - (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) 32.778 - { 32.779 - unshadow_table(page_to_pfn(page), type); 32.780 - put_shadow_status(d); 32.781 - } 32.782 -} 32.783 - 32.784 - 32.785 -void put_page_type(struct pfn_info *page) 32.786 -{ 32.787 - u32 nx, x, y = page->u.inuse.type_info; 32.788 - 32.789 - again: 32.790 - do { 32.791 - x = y; 32.792 - nx = x - 1; 32.793 - 32.794 - ASSERT((x & PGT_count_mask) != 0); 32.795 - 32.796 - /* 32.797 - * The page should always be validated while a reference is held. The 32.798 - * exception is during domain destruction, when we forcibly invalidate 32.799 - * page-table pages if we detect a referential loop. 32.800 - * See domain.c:relinquish_list(). 32.801 - */ 32.802 - ASSERT((x & PGT_validated) || 32.803 - test_bit(DF_DYING, &page_get_owner(page)->d_flags)); 32.804 - 32.805 - if ( unlikely((nx & PGT_count_mask) == 0) ) 32.806 - { 32.807 - /* Record TLB information for flush later. Races are harmless. */ 32.808 - page->tlbflush_timestamp = tlbflush_current_time(); 32.809 - 32.810 - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 32.811 - likely(nx & PGT_validated) ) 32.812 - { 32.813 - /* 32.814 - * Page-table pages must be unvalidated when count is zero. The 32.815 - * 'free' is safe because the refcnt is non-zero and validated 32.816 - * bit is clear => other ops will spin or fail. 32.817 - */ 32.818 - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 32.819 - x & ~PGT_validated)) != x) ) 32.820 - goto again; 32.821 - /* We cleared the 'valid bit' so we do the clear up. */ 32.822 - free_page_type(page, x & PGT_type_mask); 32.823 - /* Carry on, but with the 'valid bit' now clear. 
*/ 32.824 - x &= ~PGT_validated; 32.825 - nx &= ~PGT_validated; 32.826 - } 32.827 - } 32.828 - else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 32.829 - (PGT_pinned | 1)) ) 32.830 - { 32.831 - /* Page is now only pinned. Make the back pointer mutable again. */ 32.832 - nx |= PGT_va_mutable; 32.833 - } 32.834 - } 32.835 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 32.836 -} 32.837 - 32.838 - 32.839 -int get_page_type(struct pfn_info *page, u32 type) 32.840 -{ 32.841 - u32 nx, x, y = page->u.inuse.type_info; 32.842 - 32.843 - again: 32.844 - do { 32.845 - x = y; 32.846 - nx = x + 1; 32.847 - if ( unlikely((nx & PGT_count_mask) == 0) ) 32.848 - { 32.849 - MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); 32.850 - return 0; 32.851 - } 32.852 - else if ( unlikely((x & PGT_count_mask) == 0) ) 32.853 - { 32.854 - if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) 32.855 - { 32.856 - /* 32.857 - * On type change we check to flush stale TLB entries. This 32.858 - * may be unnecessary (e.g., page was GDT/LDT) but those 32.859 - * circumstances should be very rare. 32.860 - */ 32.861 - struct domain *d = page_get_owner(page); 32.862 - if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], 32.863 - page->tlbflush_timestamp)) ) 32.864 - { 32.865 - perfc_incr(need_flush_tlb_flush); 32.866 - flush_tlb_cpu(d->exec_domain[0]->processor); 32.867 - } 32.868 - 32.869 - /* We lose existing type, back pointer, and validity. */ 32.870 - nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); 32.871 - nx |= type; 32.872 - 32.873 - /* No special validation needed for writable pages. */ 32.874 - /* Page tables and GDT/LDT need to be scanned for validity. */ 32.875 - if ( type == PGT_writable_page ) 32.876 - nx |= PGT_validated; 32.877 - } 32.878 - } 32.879 - else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) 32.880 - { 32.881 - if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) 32.882 - { 32.883 - if ( ((x & PGT_type_mask) != PGT_l2_page_table) || 32.884 - ((type & PGT_type_mask) != PGT_l1_page_table) ) 32.885 - MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", 32.886 - x & PGT_type_mask, type, page_to_pfn(page)); 32.887 - return 0; 32.888 - } 32.889 - else if ( (x & PGT_va_mask) == PGT_va_mutable ) 32.890 - { 32.891 - /* The va backpointer is mutable, hence we update it. */ 32.892 - nx &= ~PGT_va_mask; 32.893 - nx |= type; /* we know the actual type is correct */ 32.894 - } 32.895 - else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) 32.896 - { 32.897 - /* This table is potentially mapped at multiple locations. */ 32.898 - nx &= ~PGT_va_mask; 32.899 - nx |= PGT_va_unknown; 32.900 - } 32.901 - } 32.902 - else if ( unlikely(!(x & PGT_validated)) ) 32.903 - { 32.904 - /* Someone else is updating validation of this page. Wait... */ 32.905 - while ( (y = page->u.inuse.type_info) == x ) 32.906 - { 32.907 - rep_nop(); 32.908 - barrier(); 32.909 - } 32.910 - goto again; 32.911 - } 32.912 - } 32.913 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 32.914 - 32.915 - if ( unlikely(!(nx & PGT_validated)) ) 32.916 - { 32.917 - /* Try to validate page type; drop the new reference on failure. */ 32.918 - if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) 32.919 - { 32.920 - MEM_LOG("Error while validating pfn %08lx for type %08x." 
32.921 - " caf=%08x taf=%08x\n", 32.922 - page_to_pfn(page), type, 32.923 - page->count_info, 32.924 - page->u.inuse.type_info); 32.925 - /* Noone else can get a reference. We hold the only ref. */ 32.926 - page->u.inuse.type_info = 0; 32.927 - return 0; 32.928 - } 32.929 - 32.930 - /* Noone else is updating simultaneously. */ 32.931 - __set_bit(_PGT_validated, &page->u.inuse.type_info); 32.932 - } 32.933 - 32.934 - return 1; 32.935 -} 32.936 - 32.937 - 32.938 -int new_guest_cr3(unsigned long pfn) 32.939 -{ 32.940 - struct exec_domain *ed = current; 32.941 - struct domain *d = ed->domain; 32.942 - int okay, cpu = smp_processor_id(); 32.943 - unsigned long old_base_pfn; 32.944 - 32.945 - okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); 32.946 - if ( likely(okay) ) 32.947 - { 32.948 - invalidate_shadow_ldt(ed); 32.949 - 32.950 - percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; 32.951 - old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 32.952 - ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 32.953 - 32.954 - shadow_mk_pagetable(ed); 32.955 - 32.956 - write_ptbase(ed); 32.957 - 32.958 - put_page_and_type(&frame_table[old_base_pfn]); 32.959 - } 32.960 - else 32.961 - { 32.962 - MEM_LOG("Error while installing new baseptr %08lx", pfn); 32.963 - } 32.964 - 32.965 - return okay; 32.966 -} 32.967 - 32.968 -static int do_extended_command(unsigned long ptr, unsigned long val) 32.969 -{ 32.970 - int okay = 1, cpu = smp_processor_id(); 32.971 - unsigned int cmd = val & MMUEXT_CMD_MASK; 32.972 - unsigned long pfn = ptr >> PAGE_SHIFT; 32.973 - struct pfn_info *page = &frame_table[pfn]; 32.974 - struct exec_domain *ed = current; 32.975 - struct domain *d = ed->domain, *nd, *e; 32.976 - u32 x, y; 32.977 - domid_t domid; 32.978 - grant_ref_t gntref; 32.979 - 32.980 - switch ( cmd ) 32.981 - { 32.982 - case MMUEXT_PIN_L1_TABLE: 32.983 - case MMUEXT_PIN_L2_TABLE: 32.984 - /* 32.985 - * We insist that, if you pin an L1 page, it's the first thing that 32.986 - * you do to it. This is because we require the backptr to still be 32.987 - * mutable. This assumption seems safe. 32.988 - */ 32.989 - okay = get_page_and_type_from_pagenr( 32.990 - pfn, 32.991 - ((cmd==MMUEXT_PIN_L2_TABLE) ? 
32.992 - PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), 32.993 - FOREIGNDOM); 32.994 - 32.995 - if ( unlikely(!okay) ) 32.996 - { 32.997 - MEM_LOG("Error while pinning pfn %08lx", pfn); 32.998 - break; 32.999 - } 32.1000 - 32.1001 - if ( unlikely(test_and_set_bit(_PGT_pinned, 32.1002 - &page->u.inuse.type_info)) ) 32.1003 - { 32.1004 - MEM_LOG("Pfn %08lx already pinned", pfn); 32.1005 - put_page_and_type(page); 32.1006 - okay = 0; 32.1007 - break; 32.1008 - } 32.1009 - 32.1010 - break; 32.1011 - 32.1012 - case MMUEXT_UNPIN_TABLE: 32.1013 - if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) 32.1014 - { 32.1015 - MEM_LOG("Page %08lx bad domain (dom=%p)", 32.1016 - ptr, page_get_owner(page)); 32.1017 - } 32.1018 - else if ( likely(test_and_clear_bit(_PGT_pinned, 32.1019 - &page->u.inuse.type_info)) ) 32.1020 - { 32.1021 - put_page_and_type(page); 32.1022 - put_page(page); 32.1023 - } 32.1024 - else 32.1025 - { 32.1026 - okay = 0; 32.1027 - put_page(page); 32.1028 - MEM_LOG("Pfn %08lx not pinned", pfn); 32.1029 - } 32.1030 - break; 32.1031 - 32.1032 - case MMUEXT_NEW_BASEPTR: 32.1033 - okay = new_guest_cr3(pfn); 32.1034 - break; 32.1035 - 32.1036 - case MMUEXT_TLB_FLUSH: 32.1037 - percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; 32.1038 - break; 32.1039 - 32.1040 - case MMUEXT_INVLPG: 32.1041 - __flush_tlb_one(ptr); 32.1042 - break; 32.1043 - 32.1044 - case MMUEXT_FLUSH_CACHE: 32.1045 - if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) 32.1046 - { 32.1047 - MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); 32.1048 - okay = 0; 32.1049 - } 32.1050 - else 32.1051 - { 32.1052 - wbinvd(); 32.1053 - } 32.1054 - break; 32.1055 - 32.1056 - case MMUEXT_SET_LDT: 32.1057 - { 32.1058 - unsigned long ents = val >> MMUEXT_CMD_SHIFT; 32.1059 - if ( ((ptr & (PAGE_SIZE-1)) != 0) || 32.1060 - (ents > 8192) || 32.1061 - ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || 32.1062 - ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) 32.1063 - { 32.1064 - okay = 0; 32.1065 - MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); 32.1066 - } 32.1067 - else if ( (ed->arch.ldt_ents != ents) || 32.1068 - (ed->arch.ldt_base != ptr) ) 32.1069 - { 32.1070 - invalidate_shadow_ldt(ed); 32.1071 - ed->arch.ldt_base = ptr; 32.1072 - ed->arch.ldt_ents = ents; 32.1073 - load_LDT(ed); 32.1074 - percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; 32.1075 - if ( ents != 0 ) 32.1076 - percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; 32.1077 - } 32.1078 - break; 32.1079 - } 32.1080 - 32.1081 - case MMUEXT_SET_FOREIGNDOM: 32.1082 - domid = (domid_t)(val >> 16); 32.1083 - 32.1084 - if ( (e = percpu_info[cpu].foreign) != NULL ) 32.1085 - put_domain(e); 32.1086 - percpu_info[cpu].foreign = NULL; 32.1087 - 32.1088 - if ( !IS_PRIV(d) ) 32.1089 - { 32.1090 - switch ( domid ) 32.1091 - { 32.1092 - case DOMID_IO: 32.1093 - get_knownalive_domain(dom_io); 32.1094 - percpu_info[cpu].foreign = dom_io; 32.1095 - break; 32.1096 - default: 32.1097 - MEM_LOG("Dom %u cannot set foreign dom\n", d->id); 32.1098 - okay = 0; 32.1099 - break; 32.1100 - } 32.1101 - } 32.1102 - else 32.1103 - { 32.1104 - percpu_info[cpu].foreign = e = find_domain_by_id(domid); 32.1105 - if ( e == NULL ) 32.1106 - { 32.1107 - switch ( domid ) 32.1108 - { 32.1109 - case DOMID_XEN: 32.1110 - get_knownalive_domain(dom_xen); 32.1111 - percpu_info[cpu].foreign = dom_xen; 32.1112 - break; 32.1113 - case DOMID_IO: 32.1114 - get_knownalive_domain(dom_io); 32.1115 - percpu_info[cpu].foreign = dom_io; 32.1116 - break; 32.1117 - default: 32.1118 - MEM_LOG("Unknown domain '%u'", 
domid); 32.1119 - okay = 0; 32.1120 - break; 32.1121 - } 32.1122 - } 32.1123 - } 32.1124 - break; 32.1125 - 32.1126 - case MMUEXT_TRANSFER_PAGE: 32.1127 - domid = (domid_t)(val >> 16); 32.1128 - gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); 32.1129 - 32.1130 - if ( unlikely(IS_XEN_HEAP_FRAME(page)) || 32.1131 - unlikely(!pfn_is_ram(pfn)) || 32.1132 - unlikely((e = find_domain_by_id(domid)) == NULL) ) 32.1133 - { 32.1134 - MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); 32.1135 - okay = 0; 32.1136 - break; 32.1137 - } 32.1138 - 32.1139 - spin_lock(&d->page_alloc_lock); 32.1140 - 32.1141 - /* 32.1142 - * The tricky bit: atomically release ownership while there is just one 32.1143 - * benign reference to the page (PGC_allocated). If that reference 32.1144 - * disappears then the deallocation routine will safely spin. 32.1145 - */ 32.1146 - nd = page_get_owner(page); 32.1147 - y = page->count_info; 32.1148 - do { 32.1149 - x = y; 32.1150 - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 32.1151 - (1|PGC_allocated)) || 32.1152 - unlikely(nd != d) ) 32.1153 - { 32.1154 - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 32.1155 - " caf=%08x, taf=%08x\n", page_to_pfn(page), 32.1156 - d, d->id, nd, x, page->u.inuse.type_info); 32.1157 - spin_unlock(&d->page_alloc_lock); 32.1158 - put_domain(e); 32.1159 - return 0; 32.1160 - } 32.1161 - __asm__ __volatile__( 32.1162 - LOCK_PREFIX "cmpxchg8b %2" 32.1163 - : "=d" (nd), "=a" (y), 32.1164 - "=m" (*(volatile u64 *)(&page->count_info)) 32.1165 - : "0" (d), "1" (x), "c" (NULL), "b" (x) ); 32.1166 - } 32.1167 - while ( unlikely(nd != d) || unlikely(y != x) ); 32.1168 - 32.1169 - /* 32.1170 - * Unlink from 'd'. At least one reference remains (now anonymous), so 32.1171 - * noone else is spinning to try to delete this page from 'd'. 32.1172 - */ 32.1173 - d->tot_pages--; 32.1174 - list_del(&page->list); 32.1175 - 32.1176 - spin_unlock(&d->page_alloc_lock); 32.1177 - 32.1178 - spin_lock(&e->page_alloc_lock); 32.1179 - 32.1180 - /* 32.1181 - * Check that 'e' will accept the page and has reservation headroom. 32.1182 - * Also, a domain mustn't have PGC_allocated pages when it is dying. 32.1183 - */ 32.1184 - ASSERT(e->tot_pages <= e->max_pages); 32.1185 - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 32.1186 - unlikely(e->tot_pages == e->max_pages) || 32.1187 - unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) 32.1188 - { 32.1189 - MEM_LOG("Transferee has no reservation headroom (%d,%d), or " 32.1190 - "provided a bad grant ref, or is dying (%08lx).\n", 32.1191 - e->tot_pages, e->max_pages, e->d_flags); 32.1192 - spin_unlock(&e->page_alloc_lock); 32.1193 - put_domain(e); 32.1194 - okay = 0; 32.1195 - break; 32.1196 - } 32.1197 - 32.1198 - /* Okay, add the page to 'e'. */ 32.1199 - if ( unlikely(e->tot_pages++ == 0) ) 32.1200 - get_knownalive_domain(e); 32.1201 - list_add_tail(&page->list, &e->page_list); 32.1202 - page_set_owner(page, e); 32.1203 - 32.1204 - spin_unlock(&e->page_alloc_lock); 32.1205 - 32.1206 - /* Transfer is all done: tell the guest about its new page frame. 
*/ 32.1207 - gnttab_notify_transfer(e, gntref, pfn); 32.1208 - 32.1209 - put_domain(e); 32.1210 - break; 32.1211 - 32.1212 - case MMUEXT_REASSIGN_PAGE: 32.1213 - if ( unlikely(!IS_PRIV(d)) ) 32.1214 - { 32.1215 - MEM_LOG("Dom %u has no reassignment priv", d->id); 32.1216 - okay = 0; 32.1217 - break; 32.1218 - } 32.1219 - 32.1220 - e = percpu_info[cpu].foreign; 32.1221 - if ( unlikely(e == NULL) ) 32.1222 - { 32.1223 - MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); 32.1224 - okay = 0; 32.1225 - break; 32.1226 - } 32.1227 - 32.1228 - /* 32.1229 - * Grab both page_list locks, in order. This prevents the page from 32.1230 - * disappearing elsewhere while we modify the owner, and we'll need 32.1231 - * both locks if we're successful so that we can change lists. 32.1232 - */ 32.1233 - if ( d < e ) 32.1234 - { 32.1235 - spin_lock(&d->page_alloc_lock); 32.1236 - spin_lock(&e->page_alloc_lock); 32.1237 - } 32.1238 - else 32.1239 - { 32.1240 - spin_lock(&e->page_alloc_lock); 32.1241 - spin_lock(&d->page_alloc_lock); 32.1242 - } 32.1243 - 32.1244 - /* A domain shouldn't have PGC_allocated pages when it is dying. */ 32.1245 - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 32.1246 - unlikely(IS_XEN_HEAP_FRAME(page)) ) 32.1247 - { 32.1248 - MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); 32.1249 - okay = 0; 32.1250 - goto reassign_fail; 32.1251 - } 32.1252 - 32.1253 - /* 32.1254 - * The tricky bit: atomically change owner while there is just one 32.1255 - * benign reference to the page (PGC_allocated). If that reference 32.1256 - * disappears then the deallocation routine will safely spin. 32.1257 - */ 32.1258 - nd = page_get_owner(page); 32.1259 - y = page->count_info; 32.1260 - do { 32.1261 - x = y; 32.1262 - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 32.1263 - (1|PGC_allocated)) || 32.1264 - unlikely(nd != d) ) 32.1265 - { 32.1266 - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 32.1267 - " caf=%08x, taf=%08x\n", page_to_pfn(page), 32.1268 - d, d->id, nd, x, page->u.inuse.type_info); 32.1269 - okay = 0; 32.1270 - goto reassign_fail; 32.1271 - } 32.1272 - __asm__ __volatile__( 32.1273 - LOCK_PREFIX "cmpxchg8b %3" 32.1274 - : "=d" (nd), "=a" (y), "=c" (e), 32.1275 - "=m" (*(volatile u64 *)(&page->count_info)) 32.1276 - : "0" (d), "1" (x), "c" (e), "b" (x) ); 32.1277 - } 32.1278 - while ( unlikely(nd != d) || unlikely(y != x) ); 32.1279 - 32.1280 - /* 32.1281 - * Unlink from 'd'. We transferred at least one reference to 'e', so 32.1282 - * noone else is spinning to try to delete this page from 'd'. 32.1283 - */ 32.1284 - d->tot_pages--; 32.1285 - list_del(&page->list); 32.1286 - 32.1287 - /* 32.1288 - * Add the page to 'e'. Someone may already have removed the last 32.1289 - * reference and want to remove the page from 'e'. However, we have 32.1290 - * the lock so they'll spin waiting for us. 
32.1291 - */ 32.1292 - if ( unlikely(e->tot_pages++ == 0) ) 32.1293 - get_knownalive_domain(e); 32.1294 - list_add_tail(&page->list, &e->page_list); 32.1295 - 32.1296 - reassign_fail: 32.1297 - spin_unlock(&d->page_alloc_lock); 32.1298 - spin_unlock(&e->page_alloc_lock); 32.1299 - break; 32.1300 - 32.1301 - case MMUEXT_CLEAR_FOREIGNDOM: 32.1302 - if ( (e = percpu_info[cpu].foreign) != NULL ) 32.1303 - put_domain(e); 32.1304 - percpu_info[cpu].foreign = NULL; 32.1305 - break; 32.1306 - 32.1307 - default: 32.1308 - MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 32.1309 - okay = 0; 32.1310 - break; 32.1311 - } 32.1312 - 32.1313 - return okay; 32.1314 -} 32.1315 - 32.1316 -int do_mmu_update( 32.1317 - mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) 32.1318 -{ 32.1319 -/* 32.1320 - * We steal the m.s.b. of the @count parameter to indicate whether this 32.1321 - * invocation of do_mmu_update() is resuming a previously preempted call. 32.1322 - * We steal the next 15 bits to remember the current FOREIGNDOM. 32.1323 - */ 32.1324 -#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) 32.1325 -#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) 32.1326 -#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT) 32.1327 - 32.1328 - mmu_update_t req; 32.1329 - unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0; 32.1330 - struct pfn_info *page; 32.1331 - int rc = 0, okay = 1, i = 0, cpu = smp_processor_id(); 32.1332 - unsigned int cmd, done = 0; 32.1333 - unsigned long prev_spfn = 0; 32.1334 - l1_pgentry_t *prev_spl1e = 0; 32.1335 - struct exec_domain *ed = current; 32.1336 - struct domain *d = ed->domain; 32.1337 - u32 type_info; 32.1338 - domid_t domid; 32.1339 - 32.1340 - LOCK_BIGLOCK(d); 32.1341 - 32.1342 - cleanup_writable_pagetable(d); 32.1343 - 32.1344 - /* 32.1345 - * If we are resuming after preemption, read how much work we have already 32.1346 - * done. This allows us to set the @done output parameter correctly. 32.1347 - * We also reset FOREIGNDOM here. 32.1348 - */ 32.1349 - if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) 32.1350 - { 32.1351 - if ( !(count & MMU_UPDATE_PREEMPTED) ) 32.1352 - { 32.1353 - /* Count overflow into private FOREIGNDOM field. 
*/ 32.1354 - MEM_LOG("do_mmu_update count is too large"); 32.1355 - rc = -EINVAL; 32.1356 - goto out; 32.1357 - } 32.1358 - count &= ~MMU_UPDATE_PREEMPTED; 32.1359 - domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; 32.1360 - count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; 32.1361 - if ( unlikely(pdone != NULL) ) 32.1362 - (void)get_user(done, pdone); 32.1363 - if ( (domid != current->domain->id) && 32.1364 - !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) 32.1365 - { 32.1366 - rc = -EINVAL; 32.1367 - goto out; 32.1368 - } 32.1369 - } 32.1370 - 32.1371 - perfc_incrc(calls_to_mmu_update); 32.1372 - perfc_addc(num_page_updates, count); 32.1373 - 32.1374 - if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) 32.1375 - { 32.1376 - rc = -EFAULT; 32.1377 - goto out; 32.1378 - } 32.1379 - 32.1380 - for ( i = 0; i < count; i++ ) 32.1381 - { 32.1382 - if ( hypercall_preempt_check() ) 32.1383 - { 32.1384 - rc = hypercall_create_continuation( 32.1385 - __HYPERVISOR_mmu_update, 3, ureqs, 32.1386 - (count - i) | 32.1387 - (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 32.1388 - MMU_UPDATE_PREEMPTED, pdone); 32.1389 - break; 32.1390 - } 32.1391 - 32.1392 - if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) 32.1393 - { 32.1394 - MEM_LOG("Bad __copy_from_user"); 32.1395 - rc = -EFAULT; 32.1396 - break; 32.1397 - } 32.1398 - 32.1399 - cmd = req.ptr & (sizeof(l1_pgentry_t)-1); 32.1400 - pfn = req.ptr >> PAGE_SHIFT; 32.1401 - 32.1402 - okay = 0; 32.1403 - 32.1404 - switch ( cmd ) 32.1405 - { 32.1406 - /* 32.1407 - * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 32.1408 - */ 32.1409 - case MMU_NORMAL_PT_UPDATE: 32.1410 - if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) 32.1411 - { 32.1412 - MEM_LOG("Could not get page for normal update"); 32.1413 - break; 32.1414 - } 32.1415 - 32.1416 - if ( likely(prev_pfn == pfn) ) 32.1417 - { 32.1418 - va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 32.1419 - } 32.1420 - else 32.1421 - { 32.1422 - if ( prev_pfn != 0 ) 32.1423 - unmap_domain_mem((void *)va); 32.1424 - va = (unsigned long)map_domain_mem(req.ptr); 32.1425 - prev_pfn = pfn; 32.1426 - } 32.1427 - 32.1428 - page = &frame_table[pfn]; 32.1429 - switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) 32.1430 - { 32.1431 - case PGT_l1_page_table: 32.1432 - if ( likely(get_page_type( 32.1433 - page, type_info & (PGT_type_mask|PGT_va_mask))) ) 32.1434 - { 32.1435 - okay = mod_l1_entry((l1_pgentry_t *)va, 32.1436 - mk_l1_pgentry(req.val)); 32.1437 - 32.1438 - if ( unlikely(d->arch.shadow_mode) && okay && 32.1439 - (get_shadow_status(d, page-frame_table) & 32.1440 - PSH_shadowed) ) 32.1441 - { 32.1442 - shadow_l1_normal_pt_update( 32.1443 - req.ptr, req.val, &prev_spfn, &prev_spl1e); 32.1444 - put_shadow_status(d); 32.1445 - } 32.1446 - 32.1447 - put_page_type(page); 32.1448 - } 32.1449 - break; 32.1450 - case PGT_l2_page_table: 32.1451 - if ( likely(get_page_type(page, PGT_l2_page_table)) ) 32.1452 - { 32.1453 - okay = mod_l2_entry((l2_pgentry_t *)va, 32.1454 - mk_l2_pgentry(req.val), 32.1455 - pfn); 32.1456 - 32.1457 - if ( unlikely(d->arch.shadow_mode) && okay && 32.1458 - (get_shadow_status(d, page-frame_table) & 32.1459 - PSH_shadowed) ) 32.1460 - { 32.1461 - shadow_l2_normal_pt_update(req.ptr, req.val); 32.1462 - put_shadow_status(d); 32.1463 - } 32.1464 - 32.1465 - put_page_type(page); 32.1466 - } 32.1467 - break; 32.1468 - default: 32.1469 - if ( likely(get_page_type(page, PGT_writable_page)) ) 32.1470 - { 32.1471 - *(unsigned 
long *)va = req.val; 32.1472 - okay = 1; 32.1473 - put_page_type(page); 32.1474 - } 32.1475 - break; 32.1476 - } 32.1477 - 32.1478 - put_page(page); 32.1479 - break; 32.1480 - 32.1481 - case MMU_MACHPHYS_UPDATE: 32.1482 - if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) 32.1483 - { 32.1484 - MEM_LOG("Could not get page for mach->phys update"); 32.1485 - break; 32.1486 - } 32.1487 - 32.1488 - machine_to_phys_mapping[pfn] = req.val; 32.1489 - okay = 1; 32.1490 - 32.1491 - /* 32.1492 - * If in log-dirty mode, mark the corresponding pseudo-physical 32.1493 - * page as dirty. 32.1494 - */ 32.1495 - if ( unlikely(d->arch.shadow_mode == SHM_logdirty) && 32.1496 - mark_dirty(d, pfn) ) 32.1497 - d->arch.shadow_dirty_block_count++; 32.1498 - 32.1499 - put_page(&frame_table[pfn]); 32.1500 - break; 32.1501 - 32.1502 - /* 32.1503 - * MMU_EXTENDED_COMMAND: Extended command is specified 32.1504 - * in the least-siginificant bits of the 'value' field. 32.1505 - */ 32.1506 - case MMU_EXTENDED_COMMAND: 32.1507 - req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 32.1508 - okay = do_extended_command(req.ptr, req.val); 32.1509 - break; 32.1510 - 32.1511 - default: 32.1512 - MEM_LOG("Invalid page update command %08lx", req.ptr); 32.1513 - break; 32.1514 - } 32.1515 - 32.1516 - if ( unlikely(!okay) ) 32.1517 - { 32.1518 - rc = -EINVAL; 32.1519 - break; 32.1520 - } 32.1521 - 32.1522 - ureqs++; 32.1523 - } 32.1524 - 32.1525 - out: 32.1526 - if ( prev_pfn != 0 ) 32.1527 - unmap_domain_mem((void *)va); 32.1528 - 32.1529 - if ( unlikely(prev_spl1e != 0) ) 32.1530 - unmap_domain_mem((void *)prev_spl1e); 32.1531 - 32.1532 - deferred_ops = percpu_info[cpu].deferred_ops; 32.1533 - percpu_info[cpu].deferred_ops = 0; 32.1534 - 32.1535 - if ( deferred_ops & DOP_FLUSH_TLB ) 32.1536 - local_flush_tlb(); 32.1537 - 32.1538 - if ( deferred_ops & DOP_RELOAD_LDT ) 32.1539 - (void)map_ldt_shadow_page(0); 32.1540 - 32.1541 - if ( unlikely(percpu_info[cpu].foreign != NULL) ) 32.1542 - { 32.1543 - put_domain(percpu_info[cpu].foreign); 32.1544 - percpu_info[cpu].foreign = NULL; 32.1545 - } 32.1546 - 32.1547 - /* Add incremental work we have done to the @done output parameter. */ 32.1548 - if ( unlikely(pdone != NULL) ) 32.1549 - __put_user(done + i, pdone); 32.1550 - 32.1551 - UNLOCK_BIGLOCK(d); 32.1552 - return rc; 32.1553 -} 32.1554 - 32.1555 - 32.1556 -int do_update_va_mapping(unsigned long page_nr, 32.1557 - unsigned long val, 32.1558 - unsigned long flags) 32.1559 -{ 32.1560 - struct exec_domain *ed = current; 32.1561 - struct domain *d = ed->domain; 32.1562 - int err = 0; 32.1563 - unsigned int cpu = ed->processor; 32.1564 - unsigned long deferred_ops; 32.1565 - 32.1566 - perfc_incrc(calls_to_update_va); 32.1567 - 32.1568 - if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) 32.1569 - return -EINVAL; 32.1570 - 32.1571 - LOCK_BIGLOCK(d); 32.1572 - 32.1573 - cleanup_writable_pagetable(d); 32.1574 - 32.1575 - /* 32.1576 - * XXX When we make this support 4MB superpages we should also deal with 32.1577 - * the case of updating L2 entries. 
32.1578 - */ 32.1579 - 32.1580 - if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 32.1581 - mk_l1_pgentry(val))) ) 32.1582 - err = -EINVAL; 32.1583 - 32.1584 - if ( unlikely(d->arch.shadow_mode) ) 32.1585 - { 32.1586 - unsigned long sval; 32.1587 - 32.1588 - l1pte_propagate_from_guest(d, &val, &sval); 32.1589 - 32.1590 - if ( unlikely(__put_user(sval, ((unsigned long *)( 32.1591 - &shadow_linear_pg_table[page_nr])))) ) 32.1592 - { 32.1593 - /* 32.1594 - * Since L2's are guranteed RW, failure indicates the page was not 32.1595 - * shadowed, so ignore. 32.1596 - */ 32.1597 - perfc_incrc(shadow_update_va_fail); 32.1598 - } 32.1599 - 32.1600 - /* 32.1601 - * If we're in log-dirty mode then we need to note that we've updated 32.1602 - * the PTE in the PT-holding page. We need the machine frame number 32.1603 - * for this. 32.1604 - */ 32.1605 - if ( d->arch.shadow_mode == SHM_logdirty ) 32.1606 - mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT)); 32.1607 - 32.1608 - check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ 32.1609 - } 32.1610 - 32.1611 - deferred_ops = percpu_info[cpu].deferred_ops; 32.1612 - percpu_info[cpu].deferred_ops = 0; 32.1613 - 32.1614 - if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 32.1615 - unlikely(flags & UVMF_FLUSH_TLB) ) 32.1616 - local_flush_tlb(); 32.1617 - else if ( unlikely(flags & UVMF_INVLPG) ) 32.1618 - __flush_tlb_one(page_nr << PAGE_SHIFT); 32.1619 - 32.1620 - if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) 32.1621 - (void)map_ldt_shadow_page(0); 32.1622 - 32.1623 - UNLOCK_BIGLOCK(d); 32.1624 - 32.1625 - return err; 32.1626 -} 32.1627 - 32.1628 -int do_update_va_mapping_otherdomain(unsigned long page_nr, 32.1629 - unsigned long val, 32.1630 - unsigned long flags, 32.1631 - domid_t domid) 32.1632 -{ 32.1633 - unsigned int cpu = smp_processor_id(); 32.1634 - struct domain *d; 32.1635 - int rc; 32.1636 - 32.1637 - if ( unlikely(!IS_PRIV(current->domain)) ) 32.1638 - return -EPERM; 32.1639 - 32.1640 - percpu_info[cpu].foreign = d = find_domain_by_id(domid); 32.1641 - if ( unlikely(d == NULL) ) 32.1642 - { 32.1643 - MEM_LOG("Unknown domain '%u'", domid); 32.1644 - return -ESRCH; 32.1645 - } 32.1646 - 32.1647 - rc = do_update_va_mapping(page_nr, val, flags); 32.1648 - 32.1649 - put_domain(d); 32.1650 - percpu_info[cpu].foreign = NULL; 32.1651 - 32.1652 - return rc; 32.1653 -} 32.1654 - 32.1655 - 32.1656 - 32.1657 -/************************* 32.1658 - * Writable Pagetables 32.1659 - */ 32.1660 - 32.1661 -ptwr_info_t ptwr_info[NR_CPUS]; 32.1662 - 32.1663 -#ifdef VERBOSE 32.1664 -int ptwr_debug = 0x0; 32.1665 -#define PTWR_PRINTK(_f, _a...) \ 32.1666 - do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) 32.1667 -#define PTWR_PRINT_WHICH (which ? 'I' : 'A') 32.1668 -#else 32.1669 -#define PTWR_PRINTK(_f, _a...) ((void)0) 32.1670 -#endif 32.1671 - 32.1672 -/* Flush the given writable p.t. page and write-protect it again. */ 32.1673 -void ptwr_flush(const int which) 32.1674 -{ 32.1675 - unsigned long sstat, spte, pte, *ptep, l1va; 32.1676 - l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; 32.1677 - l2_pgentry_t *pl2e; 32.1678 - int i, cpu = smp_processor_id(); 32.1679 - struct exec_domain *ed = current; 32.1680 - struct domain *d = ed->domain; 32.1681 - 32.1682 - l1va = ptwr_info[cpu].ptinfo[which].l1va; 32.1683 - ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; 32.1684 - 32.1685 - /* 32.1686 - * STEP 1. Write-protect the p.t. page so no more updates can occur. 
32.1687 - */ 32.1688 - 32.1689 - if ( unlikely(__get_user(pte, ptep)) ) 32.1690 - { 32.1691 - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 32.1692 - /* 32.1693 - * Really a bug. We could read this PTE during the initial fault, 32.1694 - * and pagetables can't have changed meantime. XXX Multi-CPU guests? 32.1695 - */ 32.1696 - BUG(); 32.1697 - } 32.1698 - PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", 32.1699 - PTWR_PRINT_WHICH, ptep, pte); 32.1700 - pte &= ~_PAGE_RW; 32.1701 - 32.1702 - if ( unlikely(d->arch.shadow_mode) ) 32.1703 - { 32.1704 - /* Write-protect the p.t. page in the shadow page table. */ 32.1705 - l1pte_propagate_from_guest(d, &pte, &spte); 32.1706 - __put_user( 32.1707 - spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); 32.1708 - 32.1709 - /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ 32.1710 - sstat = get_shadow_status(d, pte >> PAGE_SHIFT); 32.1711 - if ( sstat & PSH_shadowed ) 32.1712 - sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); 32.1713 - } 32.1714 - 32.1715 - /* Write-protect the p.t. page in the guest page table. */ 32.1716 - if ( unlikely(__put_user(pte, ptep)) ) 32.1717 - { 32.1718 - MEM_LOG("ptwr: Could not update pte at %p\n", ptep); 32.1719 - /* 32.1720 - * Really a bug. We could write this PTE during the initial fault, 32.1721 - * and pagetables can't have changed meantime. XXX Multi-CPU guests? 32.1722 - */ 32.1723 - BUG(); 32.1724 - } 32.1725 - 32.1726 - /* Ensure that there are no stale writable mappings in any TLB. */ 32.1727 - /* NB. INVLPG is a serialising instruction: flushes pending updates. */ 32.1728 -#if 1 32.1729 - __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ 32.1730 -#else 32.1731 - flush_tlb_all(); 32.1732 -#endif 32.1733 - PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", 32.1734 - PTWR_PRINT_WHICH, ptep, pte); 32.1735 - 32.1736 - /* 32.1737 - * STEP 2. Validate any modified PTEs. 32.1738 - */ 32.1739 - 32.1740 - pl1e = ptwr_info[cpu].ptinfo[which].pl1e; 32.1741 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.1742 - { 32.1743 - ol1e = ptwr_info[cpu].ptinfo[which].page[i]; 32.1744 - nl1e = pl1e[i]; 32.1745 - 32.1746 - if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) 32.1747 - continue; 32.1748 - 32.1749 - /* 32.1750 - * Fast path for PTEs that have merely been write-protected 32.1751 - * (e.g., during a Unix fork()). A strict reduction in privilege. 32.1752 - */ 32.1753 - if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) 32.1754 - { 32.1755 - if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) 32.1756 - { 32.1757 - if ( unlikely(sl1e != NULL) ) 32.1758 - l1pte_propagate_from_guest( 32.1759 - d, &l1_pgentry_val(nl1e), 32.1760 - &l1_pgentry_val(sl1e[i])); 32.1761 - put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); 32.1762 - } 32.1763 - continue; 32.1764 - } 32.1765 - 32.1766 - if ( unlikely(!get_page_from_l1e(nl1e, d)) ) 32.1767 - { 32.1768 - MEM_LOG("ptwr: Could not re-validate l1 page\n"); 32.1769 - /* 32.1770 - * Make the remaining p.t's consistent before crashing, so the 32.1771 - * reference counts are correct. 
32.1772 - */ 32.1773 - memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], 32.1774 - (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); 32.1775 - unmap_domain_mem(pl1e); 32.1776 - ptwr_info[cpu].ptinfo[which].l1va = 0; 32.1777 - UNLOCK_BIGLOCK(d); 32.1778 - domain_crash(); 32.1779 - } 32.1780 - 32.1781 - if ( unlikely(sl1e != NULL) ) 32.1782 - l1pte_propagate_from_guest( 32.1783 - d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); 32.1784 - 32.1785 - if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) 32.1786 - put_page_from_l1e(ol1e, d); 32.1787 - } 32.1788 - unmap_domain_mem(pl1e); 32.1789 - 32.1790 - /* 32.1791 - * STEP 3. Reattach the L1 p.t. page into the current address space. 32.1792 - */ 32.1793 - 32.1794 - if ( (which == PTWR_PT_ACTIVE) && likely(!d->arch.shadow_mode) ) 32.1795 - { 32.1796 - pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; 32.1797 - *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 32.1798 - } 32.1799 - 32.1800 - /* 32.1801 - * STEP 4. Final tidy-up. 32.1802 - */ 32.1803 - 32.1804 - ptwr_info[cpu].ptinfo[which].l1va = 0; 32.1805 - 32.1806 - if ( unlikely(sl1e != NULL) ) 32.1807 - { 32.1808 - unmap_domain_mem(sl1e); 32.1809 - put_shadow_status(d); 32.1810 - } 32.1811 -} 32.1812 - 32.1813 -/* Write page fault handler: check if guest is trying to modify a PTE. */ 32.1814 -int ptwr_do_page_fault(unsigned long addr) 32.1815 -{ 32.1816 - unsigned long pte, pfn, l2e; 32.1817 - struct pfn_info *page; 32.1818 - l2_pgentry_t *pl2e; 32.1819 - int which, cpu = smp_processor_id(); 32.1820 - u32 l2_idx; 32.1821 - 32.1822 - /* 32.1823 - * Attempt to read the PTE that maps the VA being accessed. By checking for 32.1824 - * PDE validity in the L2 we avoid many expensive fixups in __get_user(). 32.1825 - */ 32.1826 - if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & 32.1827 - _PAGE_PRESENT) || 32.1828 - __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) 32.1829 - { 32.1830 - return 0; 32.1831 - } 32.1832 - 32.1833 - pfn = pte >> PAGE_SHIFT; 32.1834 - page = &frame_table[pfn]; 32.1835 - 32.1836 - /* We are looking only for read-only mappings of p.t. pages. */ 32.1837 - if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || 32.1838 - ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) 32.1839 - { 32.1840 - return 0; 32.1841 - } 32.1842 - 32.1843 - /* Get the L2 index at which this L1 p.t. is always mapped. */ 32.1844 - l2_idx = page->u.inuse.type_info & PGT_va_mask; 32.1845 - if ( unlikely(l2_idx >= PGT_va_unknown) ) 32.1846 - { 32.1847 - domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ 32.1848 - } 32.1849 - l2_idx >>= PGT_va_shift; 32.1850 - 32.1851 - if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) 32.1852 - { 32.1853 - MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); 32.1854 - domain_crash(); 32.1855 - } 32.1856 - 32.1857 - /* 32.1858 - * Is the L1 p.t. mapped into the current address space? If so we call it 32.1859 - * an ACTIVE p.t., otherwise it is INACTIVE. 32.1860 - */ 32.1861 - pl2e = &linear_l2_table[l2_idx]; 32.1862 - l2e = l2_pgentry_val(*pl2e); 32.1863 - which = PTWR_PT_INACTIVE; 32.1864 - if ( (l2e >> PAGE_SHIFT) == pfn ) 32.1865 - { 32.1866 - /* Check the PRESENT bit to set ACTIVE. */ 32.1867 - if ( likely(l2e & _PAGE_PRESENT) ) 32.1868 - which = PTWR_PT_ACTIVE; 32.1869 - else { 32.1870 - /* 32.1871 - * If the PRESENT bit is clear, we may be conflicting with 32.1872 - * the current ACTIVE p.t. (it may be the same p.t. 
mapped 32.1873 - * at another virt addr). 32.1874 - * The ptwr_flush call below will restore the PRESENT bit. 32.1875 - */ 32.1876 - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && 32.1877 - l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) 32.1878 - which = PTWR_PT_ACTIVE; 32.1879 - } 32.1880 - } 32.1881 - 32.1882 - PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " 32.1883 - "pfn %08lx\n", PTWR_PRINT_WHICH, 32.1884 - addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); 32.1885 - 32.1886 - /* 32.1887 - * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at 32.1888 - * time. If there is already one, we must flush it out. 32.1889 - */ 32.1890 - if ( ptwr_info[cpu].ptinfo[which].l1va ) 32.1891 - ptwr_flush(which); 32.1892 - 32.1893 - ptwr_info[cpu].ptinfo[which].l1va = addr | 1; 32.1894 - ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; 32.1895 - 32.1896 - /* For safety, disconnect the L1 p.t. page from current space. */ 32.1897 - if ( (which == PTWR_PT_ACTIVE) && 32.1898 - likely(!current->domain->arch.shadow_mode) ) 32.1899 - { 32.1900 - *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); 32.1901 -#if 1 32.1902 - flush_tlb(); /* XXX Multi-CPU guests? */ 32.1903 -#else 32.1904 - flush_tlb_all(); 32.1905 -#endif 32.1906 - } 32.1907 - 32.1908 - /* Temporarily map the L1 page, and make a copy of it. */ 32.1909 - ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); 32.1910 - memcpy(ptwr_info[cpu].ptinfo[which].page, 32.1911 - ptwr_info[cpu].ptinfo[which].pl1e, 32.1912 - ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); 32.1913 - 32.1914 - /* Finally, make the p.t. page writable by the guest OS. */ 32.1915 - pte |= _PAGE_RW; 32.1916 - PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, 32.1917 - &linear_pg_table[addr>>PAGE_SHIFT], pte); 32.1918 - if ( unlikely(__put_user(pte, (unsigned long *) 32.1919 - &linear_pg_table[addr>>PAGE_SHIFT])) ) 32.1920 - { 32.1921 - MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) 32.1922 - &linear_pg_table[addr>>PAGE_SHIFT]); 32.1923 - /* Toss the writable pagetable state and crash. 
*/ 32.1924 - unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); 32.1925 - ptwr_info[cpu].ptinfo[which].l1va = 0; 32.1926 - domain_crash(); 32.1927 - } 32.1928 - 32.1929 - return EXCRET_fault_fixed; 32.1930 -} 32.1931 - 32.1932 -static __init int ptwr_init(void) 32.1933 -{ 32.1934 - int i; 32.1935 - 32.1936 - for ( i = 0; i < smp_num_cpus; i++ ) 32.1937 - { 32.1938 - ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = 32.1939 - (void *)alloc_xenheap_page(); 32.1940 - ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = 32.1941 - (void *)alloc_xenheap_page(); 32.1942 - } 32.1943 - 32.1944 - return 0; 32.1945 -} 32.1946 -__initcall(ptwr_init); 32.1947 - 32.1948 - 32.1949 - 32.1950 - 32.1951 -/************************************************************************/ 32.1952 -/************************************************************************/ 32.1953 -/************************************************************************/ 32.1954 - 32.1955 -#ifndef NDEBUG 32.1956 - 32.1957 -void ptwr_status(void) 32.1958 -{ 32.1959 - unsigned long pte, *ptep, pfn; 32.1960 - struct pfn_info *page; 32.1961 - int cpu = smp_processor_id(); 32.1962 - 32.1963 - ptep = (unsigned long *)&linear_pg_table 32.1964 - [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; 32.1965 - 32.1966 - if ( __get_user(pte, ptep) ) { 32.1967 - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 32.1968 - domain_crash(); 32.1969 - } 32.1970 - 32.1971 - pfn = pte >> PAGE_SHIFT; 32.1972 - page = &frame_table[pfn]; 32.1973 - printk("need to alloc l1 page %p\n", page); 32.1974 - /* make pt page writable */ 32.1975 - printk("need to make read-only l1-page at %p is %08lx\n", 32.1976 - ptep, pte); 32.1977 - 32.1978 - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) 32.1979 - return; 32.1980 - 32.1981 - if ( __get_user(pte, (unsigned long *) 32.1982 - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { 32.1983 - MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) 32.1984 - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); 32.1985 - domain_crash(); 32.1986 - } 32.1987 - pfn = pte >> PAGE_SHIFT; 32.1988 - page = &frame_table[pfn]; 32.1989 -} 32.1990 - 32.1991 -void audit_domain(struct domain *d) 32.1992 -{ 32.1993 - int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; 32.1994 - 32.1995 - void adjust (struct pfn_info *page, int dir, int adjtype) 32.1996 - { 32.1997 - int count = page->count_info & PGC_count_mask; 32.1998 - 32.1999 - if ( adjtype ) 32.2000 - { 32.2001 - int tcount = page->u.inuse.type_info & PGT_count_mask; 32.2002 - 32.2003 - ttot++; 32.2004 - 32.2005 - tcount += dir; 32.2006 - 32.2007 - if ( tcount < 0 ) 32.2008 - { 32.2009 - /* This will only come out once. */ 32.2010 - printk("Audit %d: type count whent below zero pfn=%x " 32.2011 - "taf=%x otaf=%x\n", 32.2012 - d->id, page-frame_table, 32.2013 - page->u.inuse.type_info, 32.2014 - page->tlbflush_timestamp); 32.2015 - } 32.2016 - 32.2017 - page->u.inuse.type_info = 32.2018 - (page->u.inuse.type_info & ~PGT_count_mask) | 32.2019 - (tcount & PGT_count_mask); 32.2020 - } 32.2021 - 32.2022 - ctot++; 32.2023 - count += dir; 32.2024 - if ( count < 0 ) 32.2025 - { 32.2026 - /* This will only come out once. 
*/ 32.2027 - printk("Audit %d: general count whent below zero pfn=%x " 32.2028 - "taf=%x otaf=%x\n", 32.2029 - d->id, page-frame_table, 32.2030 - page->u.inuse.type_info, 32.2031 - page->tlbflush_timestamp); 32.2032 - } 32.2033 - 32.2034 - page->count_info = 32.2035 - (page->count_info & ~PGC_count_mask) | 32.2036 - (count & PGC_count_mask); 32.2037 - 32.2038 - } 32.2039 - 32.2040 - void scan_for_pfn(struct domain *d, unsigned long xpfn) 32.2041 - { 32.2042 - unsigned long pfn, *pt; 32.2043 - struct list_head *list_ent; 32.2044 - struct pfn_info *page; 32.2045 - int i; 32.2046 - 32.2047 - list_ent = d->page_list.next; 32.2048 - for ( i = 0; (list_ent != &d->page_list); i++ ) 32.2049 - { 32.2050 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 32.2051 - page = &frame_table[pfn]; 32.2052 - 32.2053 - switch ( page->u.inuse.type_info & PGT_type_mask ) 32.2054 - { 32.2055 - case PGT_l1_page_table: 32.2056 - case PGT_l2_page_table: 32.2057 - pt = map_domain_mem(pfn<<PAGE_SHIFT); 32.2058 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.2059 - if ( (pt[i] & _PAGE_PRESENT) && 32.2060 - ((pt[i] >> PAGE_SHIFT) == xpfn) ) 32.2061 - printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", 32.2062 - d->id, i, pfn, page->u.inuse.type_info, 32.2063 - page->count_info); 32.2064 - unmap_domain_mem(pt); 32.2065 - } 32.2066 - 32.2067 - list_ent = frame_table[pfn].list.next; 32.2068 - } 32.2069 - 32.2070 - } 32.2071 - 32.2072 - void scan_for_pfn_remote(unsigned long xpfn) 32.2073 - { 32.2074 - struct domain *e; 32.2075 - for_each_domain ( e ) 32.2076 - scan_for_pfn( e, xpfn ); 32.2077 - } 32.2078 - 32.2079 - int i; 32.2080 - unsigned long pfn; 32.2081 - struct list_head *list_ent; 32.2082 - struct pfn_info *page; 32.2083 - 32.2084 - if ( d != current->domain ) 32.2085 - domain_pause(d); 32.2086 - synchronise_pagetables(~0UL); 32.2087 - 32.2088 - printk("pt base=%lx sh_info=%x\n", 32.2089 - pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT, 32.2090 - virt_to_page(d->shared_info)-frame_table); 32.2091 - 32.2092 - spin_lock(&d->page_alloc_lock); 32.2093 - 32.2094 - /* PHASE 0 */ 32.2095 - 32.2096 - list_ent = d->page_list.next; 32.2097 - for ( i = 0; (list_ent != &d->page_list); i++ ) 32.2098 - { 32.2099 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 32.2100 - page = &frame_table[pfn]; 32.2101 - 32.2102 - if ( page_get_owner(page) != d ) 32.2103 - BUG(); 32.2104 - 32.2105 - if ( (page->u.inuse.type_info & PGT_count_mask) > 32.2106 - (page->count_info & PGC_count_mask) ) 32.2107 - printk("taf > caf %x %x pfn=%lx\n", 32.2108 - page->u.inuse.type_info, page->count_info, pfn ); 32.2109 - 32.2110 -#if 0 /* SYSV shared memory pages plus writeable files. */ 32.2111 - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 32.2112 - (page->u.inuse.type_info & PGT_count_mask) > 1 ) 32.2113 - { 32.2114 - printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", 32.2115 - pfn, 32.2116 - page->u.inuse.type_info, 32.2117 - page->count_info ); 32.2118 - scan_for_pfn_remote(pfn); 32.2119 - } 32.2120 -#endif 32.2121 - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 32.2122 - (page->u.inuse.type_info & PGT_count_mask) > 1 ) 32.2123 - { 32.2124 - printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", 32.2125 - pfn, 32.2126 - page->u.inuse.type_info, 32.2127 - page->count_info ); 32.2128 - } 32.2129 - 32.2130 - /* Use tlbflush_timestamp to store original type_info. 
*/ 32.2131 - page->tlbflush_timestamp = page->u.inuse.type_info; 32.2132 - 32.2133 - list_ent = frame_table[pfn].list.next; 32.2134 - } 32.2135 - 32.2136 - 32.2137 - /* PHASE 1 */ 32.2138 - 32.2139 - adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1); 32.2140 - 32.2141 - list_ent = d->page_list.next; 32.2142 - for ( i = 0; (list_ent != &d->page_list); i++ ) 32.2143 - { 32.2144 - unsigned long *pt; 32.2145 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 32.2146 - page = &frame_table[pfn]; 32.2147 - 32.2148 - if ( page_get_owner(page) != d ) 32.2149 - BUG(); 32.2150 - 32.2151 - switch ( page->u.inuse.type_info & PGT_type_mask ) 32.2152 - { 32.2153 - case PGT_l2_page_table: 32.2154 - 32.2155 - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 32.2156 - printk("Audit %d: L2 not validated %x\n", 32.2157 - d->id, page->u.inuse.type_info); 32.2158 - 32.2159 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 32.2160 - printk("Audit %d: L2 not pinned %x\n", 32.2161 - d->id, page->u.inuse.type_info); 32.2162 - else 32.2163 - adjust( page, -1, 1 ); 32.2164 - 32.2165 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 32.2166 - 32.2167 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 32.2168 - { 32.2169 - if ( pt[i] & _PAGE_PRESENT ) 32.2170 - { 32.2171 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 32.2172 - struct pfn_info *l1page = &frame_table[l1pfn]; 32.2173 - 32.2174 - if ( page_get_owner(l1page) != d ) 32.2175 - { 32.2176 - printk("L2: Skip bizarre page belonging to other " 32.2177 - "dom %p\n", page_get_owner(l1page)); 32.2178 - continue; 32.2179 - } 32.2180 - 32.2181 - if ( (l1page->u.inuse.type_info & PGT_type_mask) == 32.2182 - PGT_l2_page_table ) 32.2183 - printk("Audit %d: [%x] Found %s Linear PT " 32.2184 - "t=%x pfn=%lx\n", d->id, i, 32.2185 - (l1pfn==pfn) ? 
"Self" : "Other", 32.2186 - l1page->u.inuse.type_info, 32.2187 - l1pfn); 32.2188 - else if ( (l1page->u.inuse.type_info & PGT_type_mask) != 32.2189 - PGT_l1_page_table ) 32.2190 - printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", 32.2191 - d->id, i, 32.2192 - l1page->u.inuse.type_info, 32.2193 - l1pfn); 32.2194 - 32.2195 - adjust(l1page, -1, 1); 32.2196 - } 32.2197 - } 32.2198 - 32.2199 - unmap_domain_mem(pt); 32.2200 - 32.2201 - break; 32.2202 - 32.2203 - 32.2204 - case PGT_l1_page_table: 32.2205 - 32.2206 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 32.2207 - adjust( page, -1, 1 ); 32.2208 - 32.2209 - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 32.2210 - printk("Audit %d: L1 not validated %x\n", 32.2211 - d->id, page->u.inuse.type_info); 32.2212 -#if 0 32.2213 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 32.2214 - printk("Audit %d: L1 not pinned %x\n", 32.2215 - d->id, page->u.inuse.type_info); 32.2216 -#endif 32.2217 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 32.2218 - 32.2219 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.2220 - { 32.2221 - if ( pt[i] & _PAGE_PRESENT ) 32.2222 - { 32.2223 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 32.2224 - struct pfn_info *l1page = &frame_table[l1pfn]; 32.2225 - 32.2226 - if ( l1pfn < 0x100 ) 32.2227 - { 32.2228 - lowmem_mappings++; 32.2229 - continue; 32.2230 - } 32.2231 - 32.2232 - if ( l1pfn > max_page ) 32.2233 - { 32.2234 - io_mappings++; 32.2235 - continue; 32.2236 - } 32.2237 - 32.2238 - if ( pt[i] & _PAGE_RW ) 32.2239 - { 32.2240 - 32.2241 - if ( (l1page->u.inuse.type_info & PGT_type_mask) == 32.2242 - PGT_l1_page_table || 32.2243 - (l1page->u.inuse.type_info & PGT_type_mask) == 32.2244 - PGT_l2_page_table ) 32.2245 - printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", 32.2246 - d->id, i, 32.2247 - l1page->u.inuse.type_info, 32.2248 - l1pfn); 32.2249 - 32.2250 - } 32.2251 - 32.2252 - if ( page_get_owner(l1page) != d ) 32.2253 - { 32.2254 - printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " 32.2255 - "pfn=%lx c=%08x t=%08x m2p=%lx\n", 32.2256 - d->id, pfn, i, 32.2257 - page_get_owner(l1page), 32.2258 - l1pfn, 32.2259 - l1page->count_info, 32.2260 - l1page->u.inuse.type_info, 32.2261 - machine_to_phys_mapping[l1pfn]); 32.2262 - continue; 32.2263 - } 32.2264 - 32.2265 - adjust(l1page, -1, 0); 32.2266 - } 32.2267 - } 32.2268 - 32.2269 - unmap_domain_mem(pt); 32.2270 - 32.2271 - break; 32.2272 - } 32.2273 - 32.2274 - list_ent = frame_table[pfn].list.next; 32.2275 - } 32.2276 - 32.2277 - if ( (io_mappings > 0) || (lowmem_mappings > 0) ) 32.2278 - printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", 32.2279 - d->id, lowmem_mappings, io_mappings); 32.2280 - 32.2281 - /* PHASE 2 */ 32.2282 - 32.2283 - ctot = ttot = 0; 32.2284 - list_ent = d->page_list.next; 32.2285 - for ( i = 0; (list_ent != &d->page_list); i++ ) 32.2286 - { 32.2287 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 32.2288 - page = &frame_table[pfn]; 32.2289 - 32.2290 - switch ( page->u.inuse.type_info & PGT_type_mask) 32.2291 - { 32.2292 - case PGT_l1_page_table: 32.2293 - case PGT_l2_page_table: 32.2294 - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) 32.2295 - { 32.2296 - printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n", 32.2297 - d->id, page->u.inuse.type_info, 32.2298 - page->tlbflush_timestamp, 32.2299 - page->count_info, pfn ); 32.2300 - scan_for_pfn_remote(pfn); 32.2301 - } 32.2302 - default: 32.2303 - if ( (page->count_info & PGC_count_mask) != 1 ) 32.2304 - { 
32.2305 - printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", 32.2306 - d->id, 32.2307 - page->count_info, 32.2308 - page->u.inuse.type_info, 32.2309 - page->tlbflush_timestamp, pfn ); 32.2310 - scan_for_pfn_remote(pfn); 32.2311 - } 32.2312 - break; 32.2313 - } 32.2314 - 32.2315 - list_ent = frame_table[pfn].list.next; 32.2316 - } 32.2317 - 32.2318 - /* PHASE 3 */ 32.2319 - 32.2320 - list_ent = d->page_list.next; 32.2321 - for ( i = 0; (list_ent != &d->page_list); i++ ) 32.2322 - { 32.2323 - unsigned long *pt; 32.2324 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 32.2325 - page = &frame_table[pfn]; 32.2326 - 32.2327 - switch ( page->u.inuse.type_info & PGT_type_mask ) 32.2328 - { 32.2329 - case PGT_l2_page_table: 32.2330 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 32.2331 - adjust( page, 1, 1 ); 32.2332 - 32.2333 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 32.2334 - 32.2335 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 32.2336 - { 32.2337 - if ( pt[i] & _PAGE_PRESENT ) 32.2338 - { 32.2339 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 32.2340 - struct pfn_info *l1page = &frame_table[l1pfn]; 32.2341 - 32.2342 - if ( page_get_owner(l1page) == d ) 32.2343 - adjust(l1page, 1, 1); 32.2344 - } 32.2345 - } 32.2346 - 32.2347 - unmap_domain_mem(pt); 32.2348 - break; 32.2349 - 32.2350 - case PGT_l1_page_table: 32.2351 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 32.2352 - adjust( page, 1, 1 ); 32.2353 - 32.2354 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 32.2355 - 32.2356 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 32.2357 - { 32.2358 - if ( pt[i] & _PAGE_PRESENT ) 32.2359 - { 32.2360 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 32.2361 - struct pfn_info *l1page = &frame_table[l1pfn]; 32.2362 - 32.2363 - if ( (page_get_owner(l1page) != d) || 32.2364 - (l1pfn < 0x100) || (l1pfn > max_page) ) 32.2365 - continue; 32.2366 - 32.2367 - adjust(l1page, 1, 0); 32.2368 - } 32.2369 - } 32.2370 - 32.2371 - unmap_domain_mem(pt); 32.2372 - break; 32.2373 - } 32.2374 - 32.2375 - 32.2376 - page->tlbflush_timestamp = 0; 32.2377 - 32.2378 - list_ent = frame_table[pfn].list.next; 32.2379 - } 32.2380 - 32.2381 - spin_unlock(&d->page_alloc_lock); 32.2382 - 32.2383 - adjust(&frame_table[pagetable_val( 32.2384 - d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1); 32.2385 - 32.2386 - printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot ); 32.2387 - 32.2388 - if ( d != current->domain ) 32.2389 - domain_unpause(d); 32.2390 -} 32.2391 - 32.2392 -void audit_domains(void) 32.2393 -{ 32.2394 - struct domain *d; 32.2395 - for_each_domain ( d ) 32.2396 - audit_domain(d); 32.2397 -} 32.2398 - 32.2399 -void audit_domains_key(unsigned char key) 32.2400 -{ 32.2401 - audit_domains(); 32.2402 -} 32.2403 - 32.2404 -#endif
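The ptwr fault/flush pair removed above implements Xen's writable-pagetable emulation: the first guest write to an L1 page traps, the page is snapshotted and temporarily made writable, and a later flush revalidates only the entries that differ from the snapshot. Below is a minimal, self-contained sketch of that snapshot-and-diff idea, not Xen code: pte_t, L1_ENTRIES, snapshot_l1 and count_modified_ptes are illustrative names only, and the real revalidation/propagation step is elided.

/*
 * Sketch of the snapshot-and-diff idea behind ptwr_do_page_fault()/ptwr_flush():
 * copy the L1 page when the guest first writes to it, then at flush time
 * inspect only the entries that changed since the copy was taken.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define L1_ENTRIES  1024u              /* 32-bit L1 table: 1024 4-byte PTEs */
#define PTE_PRESENT 0x001u
#define PTE_RW      0x002u

typedef uint32_t pte_t;

/* Taken on the write fault, before the guest is allowed to modify the page. */
static void snapshot_l1(const pte_t *live, pte_t *snap)
{
    memcpy(snap, live, L1_ENTRIES * sizeof(pte_t));
}

/* At flush time, only entries that changed since the snapshot need checking. */
static unsigned int count_modified_ptes(const pte_t *live, const pte_t *snap)
{
    unsigned int i, modified = 0;

    for ( i = 0; i < L1_ENTRIES; i++ )
        if ( live[i] != snap[i] )
            modified++;                /* real code would revalidate live[i] */

    return modified;
}

int main(void)
{
    static pte_t live[L1_ENTRIES], snap[L1_ENTRIES];

    snapshot_l1(live, snap);                       /* fault: copy the page   */
    live[5]  = (0x1234u << 12) | PTE_PRESENT;      /* guest writes two PTEs  */
    live[42] = (0x5678u << 12) | PTE_PRESENT | PTE_RW;

    printf("%u entries to revalidate\n", count_modified_ptes(live, snap));
    return 0;
}

Diffing against a snapshot keeps the common case cheap: a flush touches only the handful of PTEs the guest actually wrote rather than revalidating all 1024 entries of the table.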
33.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 33.2 +++ b/xen/arch/x86/mm.c Tue Feb 08 16:44:16 2005 +0000 33.3 @@ -0,0 +1,2598 @@ 33.4 +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 33.5 +/****************************************************************************** 33.6 + * arch/x86/mm.c 33.7 + * 33.8 + * Copyright (c) 2002-2005 K A Fraser 33.9 + * Copyright (c) 2004 Christian Limpach 33.10 + * 33.11 + * This program is free software; you can redistribute it and/or modify 33.12 + * it under the terms of the GNU General Public License as published by 33.13 + * the Free Software Foundation; either version 2 of the License, or 33.14 + * (at your option) any later version. 33.15 + * 33.16 + * This program is distributed in the hope that it will be useful, 33.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 33.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 33.19 + * GNU General Public License for more details. 33.20 + * 33.21 + * You should have received a copy of the GNU General Public License 33.22 + * along with this program; if not, write to the Free Software 33.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 33.24 + */ 33.25 + 33.26 +/* 33.27 + * A description of the x86 page table API: 33.28 + * 33.29 + * Domains trap to do_mmu_update with a list of update requests. 33.30 + * This is a list of (ptr, val) pairs, where the requested operation 33.31 + * is *ptr = val. 33.32 + * 33.33 + * Reference counting of pages: 33.34 + * ---------------------------- 33.35 + * Each page has two refcounts: tot_count and type_count. 33.36 + * 33.37 + * TOT_COUNT is the obvious reference count. It counts all uses of a 33.38 + * physical page frame by a domain, including uses as a page directory, 33.39 + * a page table, or simple mappings via a PTE. This count prevents a 33.40 + * domain from releasing a frame back to the free pool when it still holds 33.41 + * a reference to it. 33.42 + * 33.43 + * TYPE_COUNT is more subtle. A frame can be put to one of three 33.44 + * mutually-exclusive uses: it might be used as a page directory, or a 33.45 + * page table, or it may be mapped writable by the domain [of course, a 33.46 + * frame may not be used in any of these three ways!]. 33.47 + * So, type_count is a count of the number of times a frame is being 33.48 + * referred to in its current incarnation. Therefore, a page can only 33.49 + * change its type when its type count is zero. 33.50 + * 33.51 + * Pinning the page type: 33.52 + * ---------------------- 33.53 + * The type of a page can be pinned/unpinned with the commands 33.54 + * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, 33.55 + * pinning is not reference counted, so it can't be nested). 33.56 + * This is useful to prevent a page's type count falling to zero, at which 33.57 + * point safety checks would need to be carried out next time the count 33.58 + * is increased again. 33.59 + * 33.60 + * A further note on writable page mappings: 33.61 + * ----------------------------------------- 33.62 + * For simplicity, the count of writable mappings for a page may not 33.63 + * correspond to reality. The 'writable count' is incremented for every 33.64 + * PTE which maps the page with the _PAGE_RW flag set. However, for 33.65 + * write access to be possible the page directory entry must also have 33.66 + * its _PAGE_RW bit set. 
We do not check this as it complicates the 33.67 + * reference counting considerably [consider the case of multiple 33.68 + * directory entries referencing a single page table, some with the RW 33.69 + * bit set, others not -- it starts getting a bit messy]. 33.70 + * In normal use, this simplification shouldn't be a problem. 33.71 + * However, the logic can be added if required. 33.72 + * 33.73 + * One more note on read-only page mappings: 33.74 + * ----------------------------------------- 33.75 + * We want domains to be able to map pages for read-only access. The 33.76 + * main reason is that page tables and directories should be readable 33.77 + * by a domain, but it would not be safe for them to be writable. 33.78 + * However, domains have free access to rings 1 & 2 of the Intel 33.79 + * privilege model. In terms of page protection, these are considered 33.80 + * to be part of 'supervisor mode'. The WP bit in CR0 controls whether 33.81 + * read-only restrictions are respected in supervisor mode -- if the 33.82 + * bit is clear then any mapped page is writable. 33.83 + * 33.84 + * We get round this by always setting the WP bit and disallowing 33.85 + * updates to it. This is very unlikely to cause a problem for guest 33.86 + * OS's, which will generally use the WP bit to simplify copy-on-write 33.87 + * implementation (in that case, OS wants a fault when it writes to 33.88 + * an application-supplied buffer). 33.89 + */ 33.90 + 33.91 +#include <xen/config.h> 33.92 +#include <xen/init.h> 33.93 +#include <xen/kernel.h> 33.94 +#include <xen/lib.h> 33.95 +#include <xen/mm.h> 33.96 +#include <xen/sched.h> 33.97 +#include <xen/errno.h> 33.98 +#include <xen/perfc.h> 33.99 +#include <xen/irq.h> 33.100 +#include <xen/softirq.h> 33.101 +#include <asm/shadow.h> 33.102 +#include <asm/page.h> 33.103 +#include <asm/flushtlb.h> 33.104 +#include <asm/io.h> 33.105 +#include <asm/uaccess.h> 33.106 +#include <asm/domain_page.h> 33.107 +#include <asm/ldt.h> 33.108 + 33.109 +#ifdef VERBOSE 33.110 +#define MEM_LOG(_f, _a...) \ 33.111 + printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ 33.112 + current->domain->id , __LINE__ , ## _a ) 33.113 +#else 33.114 +#define MEM_LOG(_f, _a...) ((void)0) 33.115 +#endif 33.116 + 33.117 +static int alloc_l2_table(struct pfn_info *page); 33.118 +static int alloc_l1_table(struct pfn_info *page); 33.119 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); 33.120 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 33.121 + u32 type, 33.122 + struct domain *d); 33.123 + 33.124 +static void free_l2_table(struct pfn_info *page); 33.125 +static void free_l1_table(struct pfn_info *page); 33.126 + 33.127 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); 33.128 +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); 33.129 + 33.130 +/* Used to defer flushing of memory structures. */ 33.131 +static struct { 33.132 +#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ 33.133 +#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ 33.134 + unsigned long deferred_ops; 33.135 + /* If non-NULL, specifies a foreign subject domain for some operations. */ 33.136 + struct domain *foreign; 33.137 +} __cacheline_aligned percpu_info[NR_CPUS]; 33.138 + 33.139 +/* 33.140 + * Returns the current foreign domain; defaults to the currently-executing 33.141 + * domain if a foreign override hasn't been specified. 33.142 + */ 33.143 +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? 
: current->domain) 33.144 + 33.145 +/* Private domain structs for DOMID_XEN and DOMID_IO. */ 33.146 +static struct domain *dom_xen, *dom_io; 33.147 + 33.148 +/* Frame table and its size in pages. */ 33.149 +struct pfn_info *frame_table; 33.150 +unsigned long frame_table_size; 33.151 +unsigned long max_page; 33.152 + 33.153 +void __init init_frametable(void) 33.154 +{ 33.155 + unsigned long i, p; 33.156 + 33.157 + frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; 33.158 + frame_table_size = max_page * sizeof(struct pfn_info); 33.159 + frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; 33.160 + 33.161 + for ( i = 0; i < frame_table_size; i += (4UL << 20) ) 33.162 + { 33.163 + p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); 33.164 + if ( p == 0 ) 33.165 + panic("Not enough memory for frame table\n"); 33.166 + map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 33.167 + 4UL << 20, PAGE_HYPERVISOR); 33.168 + } 33.169 + 33.170 + memset(frame_table, 0, frame_table_size); 33.171 +} 33.172 + 33.173 +void arch_init_memory(void) 33.174 +{ 33.175 + extern void subarch_init_memory(struct domain *); 33.176 + 33.177 + memset(percpu_info, 0, sizeof(percpu_info)); 33.178 + 33.179 + /* 33.180 + * Initialise our DOMID_XEN domain. 33.181 + * Any Xen-heap pages that we will allow to be mapped will have 33.182 + * their domain field set to dom_xen. 33.183 + */ 33.184 + dom_xen = alloc_domain_struct(); 33.185 + atomic_set(&dom_xen->refcnt, 1); 33.186 + dom_xen->id = DOMID_XEN; 33.187 + 33.188 + /* 33.189 + * Initialise our DOMID_IO domain. 33.190 + * This domain owns no pages but is considered a special case when 33.191 + * mapping I/O pages, as the mappings occur at the priv of the caller. 33.192 + */ 33.193 + dom_io = alloc_domain_struct(); 33.194 + atomic_set(&dom_io->refcnt, 1); 33.195 + dom_io->id = DOMID_IO; 33.196 + 33.197 + subarch_init_memory(dom_xen); 33.198 +} 33.199 + 33.200 +void write_ptbase(struct exec_domain *ed) 33.201 +{ 33.202 + struct domain *d = ed->domain; 33.203 + unsigned long pa; 33.204 + 33.205 +#ifdef CONFIG_VMX 33.206 + if ( unlikely(shadow_mode(d)) ) 33.207 + pa = ((shadow_mode(d) == SHM_full_32) ? 33.208 + pagetable_val(ed->arch.monitor_table) : 33.209 + pagetable_val(ed->arch.shadow_table)); 33.210 + else 33.211 + pa = pagetable_val(ed->arch.pagetable); 33.212 +#else 33.213 + if ( unlikely(shadow_mode(d)) ) 33.214 + pa = pagetable_val(ed->arch.shadow_table); 33.215 + else 33.216 + pa = pagetable_val(ed->arch.pagetable); 33.217 +#endif 33.218 + 33.219 + write_cr3(pa); 33.220 +} 33.221 + 33.222 +static void __invalidate_shadow_ldt(struct exec_domain *d) 33.223 +{ 33.224 + int i; 33.225 + unsigned long pfn; 33.226 + struct pfn_info *page; 33.227 + 33.228 + d->arch.shadow_ldt_mapcnt = 0; 33.229 + 33.230 + for ( i = 16; i < 32; i++ ) 33.231 + { 33.232 + pfn = l1_pgentry_to_pfn(d->arch.perdomain_ptes[i]); 33.233 + if ( pfn == 0 ) continue; 33.234 + d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 33.235 + page = &frame_table[pfn]; 33.236 + ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); 33.237 + ASSERT_PAGE_IS_DOMAIN(page, d->domain); 33.238 + put_page_and_type(page); 33.239 + } 33.240 + 33.241 + /* Dispose of the (now possibly invalid) mappings from the TLB. 
*/ 33.242 + percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; 33.243 +} 33.244 + 33.245 + 33.246 +static inline void invalidate_shadow_ldt(struct exec_domain *d) 33.247 +{ 33.248 + if ( d->arch.shadow_ldt_mapcnt != 0 ) 33.249 + __invalidate_shadow_ldt(d); 33.250 +} 33.251 + 33.252 + 33.253 +static int alloc_segdesc_page(struct pfn_info *page) 33.254 +{ 33.255 + struct desc_struct *descs; 33.256 + int i; 33.257 + 33.258 + descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); 33.259 + 33.260 + for ( i = 0; i < 512; i++ ) 33.261 + if ( unlikely(!check_descriptor(&descs[i])) ) 33.262 + goto fail; 33.263 + 33.264 + unmap_domain_mem(descs); 33.265 + return 1; 33.266 + 33.267 + fail: 33.268 + unmap_domain_mem(descs); 33.269 + return 0; 33.270 +} 33.271 + 33.272 + 33.273 +/* Map shadow page at offset @off. */ 33.274 +int map_ldt_shadow_page(unsigned int off) 33.275 +{ 33.276 + struct exec_domain *ed = current; 33.277 + struct domain *d = ed->domain; 33.278 + unsigned long l1e; 33.279 + 33.280 + if ( unlikely(in_irq()) ) 33.281 + BUG(); 33.282 + 33.283 + __get_user(l1e, (unsigned long *) 33.284 + &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]); 33.285 + 33.286 + if ( unlikely(!(l1e & _PAGE_PRESENT)) || 33.287 + unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 33.288 + d, PGT_ldt_page)) ) 33.289 + return 0; 33.290 + 33.291 + ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); 33.292 + ed->arch.shadow_ldt_mapcnt++; 33.293 + 33.294 + return 1; 33.295 +} 33.296 + 33.297 + 33.298 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) 33.299 +{ 33.300 + struct pfn_info *page = &frame_table[page_nr]; 33.301 + 33.302 + if ( unlikely(!pfn_is_ram(page_nr)) ) 33.303 + { 33.304 + MEM_LOG("Pfn %08lx is not RAM", page_nr); 33.305 + return 0; 33.306 + } 33.307 + 33.308 + if ( unlikely(!get_page(page, d)) ) 33.309 + { 33.310 + MEM_LOG("Could not get page ref for pfn %08lx", page_nr); 33.311 + return 0; 33.312 + } 33.313 + 33.314 + return 1; 33.315 +} 33.316 + 33.317 + 33.318 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 33.319 + u32 type, 33.320 + struct domain *d) 33.321 +{ 33.322 + struct pfn_info *page = &frame_table[page_nr]; 33.323 + 33.324 + if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) 33.325 + return 0; 33.326 + 33.327 + if ( unlikely(!get_page_type(page, type)) ) 33.328 + { 33.329 +#ifdef VERBOSE 33.330 + if ( (type & PGT_type_mask) != PGT_l1_page_table ) 33.331 + MEM_LOG("Bad page type for pfn %08lx (%08x)", 33.332 + page_nr, page->u.inuse.type_info); 33.333 +#endif 33.334 + put_page(page); 33.335 + return 0; 33.336 + } 33.337 + 33.338 + return 1; 33.339 +} 33.340 + 33.341 + 33.342 +/* 33.343 + * We allow an L2 tables to map each other (a.k.a. linear page tables). It 33.344 + * needs some special care with reference counst and access permissions: 33.345 + * 1. The mapping entry must be read-only, or the guest may get write access 33.346 + * to its own PTEs. 33.347 + * 2. We must only bump the reference counts for an *already validated* 33.348 + * L2 table, or we can end up in a deadlock in get_page_type() by waiting 33.349 + * on a validation that is required to complete that validation. 33.350 + * 3. We only need to increment the reference counts for the mapped page 33.351 + * frame if it is mapped by a different L2 table. This is sufficient and 33.352 + * also necessary to allow validation of an L2 table mapping itself. 
33.353 + */ 33.354 +static int 33.355 +get_linear_pagetable( 33.356 + l2_pgentry_t l2e, unsigned long pfn, struct domain *d) 33.357 +{ 33.358 + u32 x, y; 33.359 + struct pfn_info *page; 33.360 + 33.361 + if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) 33.362 + { 33.363 + MEM_LOG("Attempt to create linear p.t. with write perms"); 33.364 + return 0; 33.365 + } 33.366 + 33.367 + if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 33.368 + { 33.369 + /* Make sure the mapped frame belongs to the correct domain. */ 33.370 + if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pfn(l2e), d)) ) 33.371 + return 0; 33.372 + 33.373 + /* 33.374 + * Make sure that the mapped frame is an already-validated L2 table. 33.375 + * If so, atomically increment the count (checking for overflow). 33.376 + */ 33.377 + page = &frame_table[l2_pgentry_to_pfn(l2e)]; 33.378 + y = page->u.inuse.type_info; 33.379 + do { 33.380 + x = y; 33.381 + if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || 33.382 + unlikely((x & (PGT_type_mask|PGT_validated)) != 33.383 + (PGT_l2_page_table|PGT_validated)) ) 33.384 + { 33.385 + put_page(page); 33.386 + return 0; 33.387 + } 33.388 + } 33.389 + while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); 33.390 + } 33.391 + 33.392 + return 1; 33.393 +} 33.394 + 33.395 + 33.396 +static int 33.397 +get_page_from_l1e( 33.398 + l1_pgentry_t l1e, struct domain *d) 33.399 +{ 33.400 + unsigned long l1v = l1_pgentry_val(l1e); 33.401 + unsigned long pfn = l1_pgentry_to_pfn(l1e); 33.402 + struct pfn_info *page = &frame_table[pfn]; 33.403 + extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); 33.404 + 33.405 + if ( !(l1v & _PAGE_PRESENT) ) 33.406 + return 1; 33.407 + 33.408 + if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) 33.409 + { 33.410 + MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); 33.411 + return 0; 33.412 + } 33.413 + 33.414 + if ( unlikely(!pfn_is_ram(pfn)) ) 33.415 + { 33.416 + /* Revert to caller privileges if FD == DOMID_IO. */ 33.417 + if ( d == dom_io ) 33.418 + d = current->domain; 33.419 + 33.420 + if ( IS_PRIV(d) ) 33.421 + return 1; 33.422 + 33.423 + if ( IS_CAPABLE_PHYSDEV(d) ) 33.424 + return domain_iomem_in_pfn(d, pfn); 33.425 + 33.426 + MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); 33.427 + return 0; 33.428 + } 33.429 + 33.430 + return ((l1v & _PAGE_RW) ? 33.431 + get_page_and_type(page, d, PGT_writable_page) : 33.432 + get_page(page, d)); 33.433 +} 33.434 + 33.435 + 33.436 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ 33.437 +static int 33.438 +get_page_from_l2e( 33.439 + l2_pgentry_t l2e, unsigned long pfn, 33.440 + struct domain *d, unsigned long va_idx) 33.441 +{ 33.442 + int rc; 33.443 + 33.444 + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 33.445 + return 1; 33.446 + 33.447 + if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 33.448 + { 33.449 + MEM_LOG("Bad L2 page type settings %04lx", 33.450 + l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); 33.451 + return 0; 33.452 + } 33.453 + 33.454 + rc = get_page_and_type_from_pagenr( 33.455 + l2_pgentry_to_pfn(l2e), 33.456 + PGT_l1_page_table | (va_idx<<PGT_va_shift), d); 33.457 + 33.458 + if ( unlikely(!rc) ) 33.459 + return get_linear_pagetable(l2e, pfn, d); 33.460 + 33.461 + return 1; 33.462 +} 33.463 + 33.464 + 33.465 +static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) 33.466 +{ 33.467 + unsigned long l1v = l1_pgentry_val(l1e); 33.468 + unsigned long pfn = l1_pgentry_to_pfn(l1e); 33.469 + struct pfn_info *page = &frame_table[pfn]; 33.470 + struct domain *e; 33.471 + 33.472 + if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) ) 33.473 + return; 33.474 + 33.475 + e = page_get_owner(page); 33.476 + if ( unlikely(e != d) ) 33.477 + { 33.478 + /* 33.479 + * Unmap a foreign page that may have been mapped via a grant table. 33.480 + * Note that this can fail for a privileged domain that can map foreign 33.481 + * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings 33.482 + * counted via a grant entry and some counted directly in the page 33.483 + * structure's reference count. Note that reference counts won't get 33.484 + * dangerously confused as long as we always try to decrement the 33.485 + * grant entry first. We may end up with a mismatch between which 33.486 + * mappings and which unmappings are counted via the grant entry, but 33.487 + * really it doesn't matter as privileged domains have carte blanche. 33.488 + */ 33.489 + if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) ) 33.490 + return; 33.491 + /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */ 33.492 + } 33.493 + 33.494 + if ( l1v & _PAGE_RW ) 33.495 + { 33.496 + put_page_and_type(page); 33.497 + } 33.498 + else 33.499 + { 33.500 + /* We expect this is rare so we blow the entire shadow LDT. */ 33.501 + if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 33.502 + PGT_ldt_page)) && 33.503 + unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) 33.504 + invalidate_shadow_ldt(e->exec_domain[0]); 33.505 + put_page(page); 33.506 + } 33.507 +} 33.508 + 33.509 + 33.510 +/* 33.511 + * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 33.512 + * Note also that this automatically deals correctly with linear p.t.'s. 
33.513 + */ 33.514 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 33.515 +{ 33.516 + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 33.517 + ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) 33.518 + put_page_and_type(&frame_table[l2_pgentry_to_pfn(l2e)]); 33.519 +} 33.520 + 33.521 + 33.522 +static int alloc_l2_table(struct pfn_info *page) 33.523 +{ 33.524 + struct domain *d = page_get_owner(page); 33.525 + unsigned long page_nr = page_to_pfn(page); 33.526 + l2_pgentry_t *pl2e; 33.527 + int i; 33.528 + 33.529 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 33.530 + 33.531 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 33.532 + if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) 33.533 + goto fail; 33.534 + 33.535 +#if defined(__i386__) 33.536 + /* Now we add our private high mappings. */ 33.537 + memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 33.538 + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 33.539 + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 33.540 + pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = 33.541 + mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 33.542 + pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 33.543 + mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 33.544 + __PAGE_HYPERVISOR); 33.545 +#endif 33.546 + 33.547 + unmap_domain_mem(pl2e); 33.548 + return 1; 33.549 + 33.550 + fail: 33.551 + while ( i-- > 0 ) 33.552 + put_page_from_l2e(pl2e[i], page_nr); 33.553 + 33.554 + unmap_domain_mem(pl2e); 33.555 + return 0; 33.556 +} 33.557 + 33.558 + 33.559 +static int alloc_l1_table(struct pfn_info *page) 33.560 +{ 33.561 + struct domain *d = page_get_owner(page); 33.562 + unsigned long page_nr = page_to_pfn(page); 33.563 + l1_pgentry_t *pl1e; 33.564 + int i; 33.565 + 33.566 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 33.567 + 33.568 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.569 + if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) 33.570 + goto fail; 33.571 + 33.572 + unmap_domain_mem(pl1e); 33.573 + return 1; 33.574 + 33.575 + fail: 33.576 + while ( i-- > 0 ) 33.577 + put_page_from_l1e(pl1e[i], d); 33.578 + 33.579 + unmap_domain_mem(pl1e); 33.580 + return 0; 33.581 +} 33.582 + 33.583 + 33.584 +static void free_l2_table(struct pfn_info *page) 33.585 +{ 33.586 + unsigned long page_nr = page - frame_table; 33.587 + l2_pgentry_t *pl2e; 33.588 + int i; 33.589 + 33.590 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 33.591 + 33.592 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 33.593 + put_page_from_l2e(pl2e[i], page_nr); 33.594 + 33.595 + unmap_domain_mem(pl2e); 33.596 +} 33.597 + 33.598 + 33.599 +static void free_l1_table(struct pfn_info *page) 33.600 +{ 33.601 + struct domain *d = page_get_owner(page); 33.602 + unsigned long page_nr = page - frame_table; 33.603 + l1_pgentry_t *pl1e; 33.604 + int i; 33.605 + 33.606 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 33.607 + 33.608 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.609 + put_page_from_l1e(pl1e[i], d); 33.610 + 33.611 + unmap_domain_mem(pl1e); 33.612 +} 33.613 + 33.614 + 33.615 +static inline int update_l2e(l2_pgentry_t *pl2e, 33.616 + l2_pgentry_t ol2e, 33.617 + l2_pgentry_t nl2e) 33.618 +{ 33.619 + unsigned long o = cmpxchg((unsigned long *)pl2e, 33.620 + l2_pgentry_val(ol2e), 33.621 + l2_pgentry_val(nl2e)); 33.622 + if ( o != l2_pgentry_val(ol2e) ) 33.623 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 33.624 + l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); 33.625 + return (o == 
l2_pgentry_val(ol2e)); 33.626 +} 33.627 + 33.628 + 33.629 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ 33.630 +static int mod_l2_entry(l2_pgentry_t *pl2e, 33.631 + l2_pgentry_t nl2e, 33.632 + unsigned long pfn) 33.633 +{ 33.634 + l2_pgentry_t ol2e; 33.635 + unsigned long _ol2e; 33.636 + 33.637 + if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= 33.638 + DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 33.639 + { 33.640 + MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); 33.641 + return 0; 33.642 + } 33.643 + 33.644 + if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) 33.645 + return 0; 33.646 + ol2e = mk_l2_pgentry(_ol2e); 33.647 + 33.648 + if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) 33.649 + { 33.650 + /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 33.651 + if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) 33.652 + return update_l2e(pl2e, ol2e, nl2e); 33.653 + 33.654 + if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, 33.655 + ((unsigned long)pl2e & 33.656 + ~PAGE_MASK) >> 2)) ) 33.657 + return 0; 33.658 + 33.659 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 33.660 + { 33.661 + put_page_from_l2e(nl2e, pfn); 33.662 + return 0; 33.663 + } 33.664 + 33.665 + put_page_from_l2e(ol2e, pfn); 33.666 + return 1; 33.667 + } 33.668 + 33.669 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 33.670 + return 0; 33.671 + 33.672 + put_page_from_l2e(ol2e, pfn); 33.673 + return 1; 33.674 +} 33.675 + 33.676 + 33.677 +static inline int update_l1e(l1_pgentry_t *pl1e, 33.678 + l1_pgentry_t ol1e, 33.679 + l1_pgentry_t nl1e) 33.680 +{ 33.681 + unsigned long o = l1_pgentry_val(ol1e); 33.682 + unsigned long n = l1_pgentry_val(nl1e); 33.683 + 33.684 + if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || 33.685 + unlikely(o != l1_pgentry_val(ol1e)) ) 33.686 + { 33.687 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 33.688 + l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); 33.689 + return 0; 33.690 + } 33.691 + 33.692 + return 1; 33.693 +} 33.694 + 33.695 + 33.696 +/* Update the L1 entry at pl1e to new value nl1e. */ 33.697 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) 33.698 +{ 33.699 + l1_pgentry_t ol1e; 33.700 + unsigned long _ol1e; 33.701 + struct domain *d = current->domain; 33.702 + 33.703 + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) 33.704 + { 33.705 + MEM_LOG("Bad get_user\n"); 33.706 + return 0; 33.707 + } 33.708 + 33.709 + ol1e = mk_l1_pgentry(_ol1e); 33.710 + 33.711 + if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) 33.712 + { 33.713 + /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? 
*/ 33.714 + if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) 33.715 + return update_l1e(pl1e, ol1e, nl1e); 33.716 + 33.717 + if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) 33.718 + return 0; 33.719 + 33.720 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 33.721 + { 33.722 + put_page_from_l1e(nl1e, d); 33.723 + return 0; 33.724 + } 33.725 + 33.726 + put_page_from_l1e(ol1e, d); 33.727 + return 1; 33.728 + } 33.729 + 33.730 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 33.731 + return 0; 33.732 + 33.733 + put_page_from_l1e(ol1e, d); 33.734 + return 1; 33.735 +} 33.736 + 33.737 + 33.738 +int alloc_page_type(struct pfn_info *page, unsigned int type) 33.739 +{ 33.740 + switch ( type ) 33.741 + { 33.742 + case PGT_l1_page_table: 33.743 + return alloc_l1_table(page); 33.744 + case PGT_l2_page_table: 33.745 + return alloc_l2_table(page); 33.746 + case PGT_gdt_page: 33.747 + case PGT_ldt_page: 33.748 + return alloc_segdesc_page(page); 33.749 + default: 33.750 + printk("Bad type in alloc_page_type %x t=%x c=%x\n", 33.751 + type, page->u.inuse.type_info, 33.752 + page->count_info); 33.753 + BUG(); 33.754 + } 33.755 + 33.756 + return 0; 33.757 +} 33.758 + 33.759 + 33.760 +void free_page_type(struct pfn_info *page, unsigned int type) 33.761 +{ 33.762 + struct domain *d = page_get_owner(page); 33.763 + 33.764 + switch ( type ) 33.765 + { 33.766 + case PGT_l1_page_table: 33.767 + free_l1_table(page); 33.768 + break; 33.769 + 33.770 + case PGT_l2_page_table: 33.771 + free_l2_table(page); 33.772 + break; 33.773 + 33.774 + default: 33.775 + BUG(); 33.776 + } 33.777 + 33.778 + if ( unlikely(shadow_mode(d)) && 33.779 + (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) 33.780 + { 33.781 + unshadow_table(page_to_pfn(page), type); 33.782 + put_shadow_status(d); 33.783 + } 33.784 +} 33.785 + 33.786 + 33.787 +void put_page_type(struct pfn_info *page) 33.788 +{ 33.789 + u32 nx, x, y = page->u.inuse.type_info; 33.790 + 33.791 + again: 33.792 + do { 33.793 + x = y; 33.794 + nx = x - 1; 33.795 + 33.796 + ASSERT((x & PGT_count_mask) != 0); 33.797 + 33.798 + /* 33.799 + * The page should always be validated while a reference is held. The 33.800 + * exception is during domain destruction, when we forcibly invalidate 33.801 + * page-table pages if we detect a referential loop. 33.802 + * See domain.c:relinquish_list(). 33.803 + */ 33.804 + ASSERT((x & PGT_validated) || 33.805 + test_bit(DF_DYING, &page_get_owner(page)->d_flags)); 33.806 + 33.807 + if ( unlikely((nx & PGT_count_mask) == 0) ) 33.808 + { 33.809 + /* Record TLB information for flush later. Races are harmless. */ 33.810 + page->tlbflush_timestamp = tlbflush_current_time(); 33.811 + 33.812 + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 33.813 + likely(nx & PGT_validated) ) 33.814 + { 33.815 + /* 33.816 + * Page-table pages must be unvalidated when count is zero. The 33.817 + * 'free' is safe because the refcnt is non-zero and validated 33.818 + * bit is clear => other ops will spin or fail. 33.819 + */ 33.820 + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 33.821 + x & ~PGT_validated)) != x) ) 33.822 + goto again; 33.823 + /* We cleared the 'valid bit' so we do the clear up. */ 33.824 + free_page_type(page, x & PGT_type_mask); 33.825 + /* Carry on, but with the 'valid bit' now clear. 
*/ 33.826 + x &= ~PGT_validated; 33.827 + nx &= ~PGT_validated; 33.828 + } 33.829 + } 33.830 + else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 33.831 + (PGT_pinned | 1)) ) 33.832 + { 33.833 + /* Page is now only pinned. Make the back pointer mutable again. */ 33.834 + nx |= PGT_va_mutable; 33.835 + } 33.836 + } 33.837 + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 33.838 +} 33.839 + 33.840 + 33.841 +int get_page_type(struct pfn_info *page, u32 type) 33.842 +{ 33.843 + u32 nx, x, y = page->u.inuse.type_info; 33.844 + 33.845 + again: 33.846 + do { 33.847 + x = y; 33.848 + nx = x + 1; 33.849 + if ( unlikely((nx & PGT_count_mask) == 0) ) 33.850 + { 33.851 + MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); 33.852 + return 0; 33.853 + } 33.854 + else if ( unlikely((x & PGT_count_mask) == 0) ) 33.855 + { 33.856 + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) 33.857 + { 33.858 + /* 33.859 + * On type change we check to flush stale TLB entries. This 33.860 + * may be unnecessary (e.g., page was GDT/LDT) but those 33.861 + * circumstances should be very rare. 33.862 + */ 33.863 + struct domain *d = page_get_owner(page); 33.864 + if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], 33.865 + page->tlbflush_timestamp)) ) 33.866 + { 33.867 + perfc_incr(need_flush_tlb_flush); 33.868 + flush_tlb_cpu(d->exec_domain[0]->processor); 33.869 + } 33.870 + 33.871 + /* We lose existing type, back pointer, and validity. */ 33.872 + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); 33.873 + nx |= type; 33.874 + 33.875 + /* No special validation needed for writable pages. */ 33.876 + /* Page tables and GDT/LDT need to be scanned for validity. */ 33.877 + if ( type == PGT_writable_page ) 33.878 + nx |= PGT_validated; 33.879 + } 33.880 + } 33.881 + else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) 33.882 + { 33.883 + if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) 33.884 + { 33.885 + if ( ((x & PGT_type_mask) != PGT_l2_page_table) || 33.886 + ((type & PGT_type_mask) != PGT_l1_page_table) ) 33.887 + MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", 33.888 + x & PGT_type_mask, type, page_to_pfn(page)); 33.889 + return 0; 33.890 + } 33.891 + else if ( (x & PGT_va_mask) == PGT_va_mutable ) 33.892 + { 33.893 + /* The va backpointer is mutable, hence we update it. */ 33.894 + nx &= ~PGT_va_mask; 33.895 + nx |= type; /* we know the actual type is correct */ 33.896 + } 33.897 + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) 33.898 + { 33.899 + /* This table is potentially mapped at multiple locations. */ 33.900 + nx &= ~PGT_va_mask; 33.901 + nx |= PGT_va_unknown; 33.902 + } 33.903 + } 33.904 + else if ( unlikely(!(x & PGT_validated)) ) 33.905 + { 33.906 + /* Someone else is updating validation of this page. Wait... */ 33.907 + while ( (y = page->u.inuse.type_info) == x ) 33.908 + { 33.909 + rep_nop(); 33.910 + barrier(); 33.911 + } 33.912 + goto again; 33.913 + } 33.914 + } 33.915 + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 33.916 + 33.917 + if ( unlikely(!(nx & PGT_validated)) ) 33.918 + { 33.919 + /* Try to validate page type; drop the new reference on failure. */ 33.920 + if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) 33.921 + { 33.922 + MEM_LOG("Error while validating pfn %08lx for type %08x." 
33.923 + " caf=%08x taf=%08x\n", 33.924 + page_to_pfn(page), type, 33.925 + page->count_info, 33.926 + page->u.inuse.type_info); 33.927 + /* Noone else can get a reference. We hold the only ref. */ 33.928 + page->u.inuse.type_info = 0; 33.929 + return 0; 33.930 + } 33.931 + 33.932 + /* Noone else is updating simultaneously. */ 33.933 + __set_bit(_PGT_validated, &page->u.inuse.type_info); 33.934 + } 33.935 + 33.936 + return 1; 33.937 +} 33.938 + 33.939 + 33.940 +int new_guest_cr3(unsigned long pfn) 33.941 +{ 33.942 + struct exec_domain *ed = current; 33.943 + struct domain *d = ed->domain; 33.944 + int okay, cpu = smp_processor_id(); 33.945 + unsigned long old_base_pfn; 33.946 + 33.947 + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); 33.948 + if ( likely(okay) ) 33.949 + { 33.950 + invalidate_shadow_ldt(ed); 33.951 + 33.952 + percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; 33.953 + old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 33.954 + ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 33.955 + 33.956 + shadow_mk_pagetable(ed); 33.957 + 33.958 + write_ptbase(ed); 33.959 + 33.960 + put_page_and_type(&frame_table[old_base_pfn]); 33.961 + } 33.962 + else 33.963 + { 33.964 + MEM_LOG("Error while installing new baseptr %08lx", pfn); 33.965 + } 33.966 + 33.967 + return okay; 33.968 +} 33.969 + 33.970 +static int do_extended_command(unsigned long ptr, unsigned long val) 33.971 +{ 33.972 + int okay = 1, cpu = smp_processor_id(); 33.973 + unsigned int cmd = val & MMUEXT_CMD_MASK; 33.974 + unsigned long pfn = ptr >> PAGE_SHIFT; 33.975 + struct pfn_info *page = &frame_table[pfn]; 33.976 + struct exec_domain *ed = current; 33.977 + struct domain *d = ed->domain, *nd, *e; 33.978 + u32 x, y; 33.979 + domid_t domid; 33.980 + grant_ref_t gntref; 33.981 + 33.982 + switch ( cmd ) 33.983 + { 33.984 + case MMUEXT_PIN_L1_TABLE: 33.985 + case MMUEXT_PIN_L2_TABLE: 33.986 + /* 33.987 + * We insist that, if you pin an L1 page, it's the first thing that 33.988 + * you do to it. This is because we require the backptr to still be 33.989 + * mutable. This assumption seems safe. 33.990 + */ 33.991 + okay = get_page_and_type_from_pagenr( 33.992 + pfn, 33.993 + ((cmd==MMUEXT_PIN_L2_TABLE) ? 
33.994 + PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), 33.995 + FOREIGNDOM); 33.996 + 33.997 + if ( unlikely(!okay) ) 33.998 + { 33.999 + MEM_LOG("Error while pinning pfn %08lx", pfn); 33.1000 + break; 33.1001 + } 33.1002 + 33.1003 + if ( unlikely(test_and_set_bit(_PGT_pinned, 33.1004 + &page->u.inuse.type_info)) ) 33.1005 + { 33.1006 + MEM_LOG("Pfn %08lx already pinned", pfn); 33.1007 + put_page_and_type(page); 33.1008 + okay = 0; 33.1009 + break; 33.1010 + } 33.1011 + 33.1012 + break; 33.1013 + 33.1014 + case MMUEXT_UNPIN_TABLE: 33.1015 + if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) 33.1016 + { 33.1017 + MEM_LOG("Page %08lx bad domain (dom=%p)", 33.1018 + ptr, page_get_owner(page)); 33.1019 + } 33.1020 + else if ( likely(test_and_clear_bit(_PGT_pinned, 33.1021 + &page->u.inuse.type_info)) ) 33.1022 + { 33.1023 + put_page_and_type(page); 33.1024 + put_page(page); 33.1025 + } 33.1026 + else 33.1027 + { 33.1028 + okay = 0; 33.1029 + put_page(page); 33.1030 + MEM_LOG("Pfn %08lx not pinned", pfn); 33.1031 + } 33.1032 + break; 33.1033 + 33.1034 + case MMUEXT_NEW_BASEPTR: 33.1035 + okay = new_guest_cr3(pfn); 33.1036 + break; 33.1037 + 33.1038 + case MMUEXT_TLB_FLUSH: 33.1039 + percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; 33.1040 + break; 33.1041 + 33.1042 + case MMUEXT_INVLPG: 33.1043 + __flush_tlb_one(ptr); 33.1044 + break; 33.1045 + 33.1046 + case MMUEXT_FLUSH_CACHE: 33.1047 + if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) 33.1048 + { 33.1049 + MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); 33.1050 + okay = 0; 33.1051 + } 33.1052 + else 33.1053 + { 33.1054 + wbinvd(); 33.1055 + } 33.1056 + break; 33.1057 + 33.1058 + case MMUEXT_SET_LDT: 33.1059 + { 33.1060 + unsigned long ents = val >> MMUEXT_CMD_SHIFT; 33.1061 + if ( ((ptr & (PAGE_SIZE-1)) != 0) || 33.1062 + (ents > 8192) || 33.1063 + ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || 33.1064 + ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) 33.1065 + { 33.1066 + okay = 0; 33.1067 + MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); 33.1068 + } 33.1069 + else if ( (ed->arch.ldt_ents != ents) || 33.1070 + (ed->arch.ldt_base != ptr) ) 33.1071 + { 33.1072 + invalidate_shadow_ldt(ed); 33.1073 + ed->arch.ldt_base = ptr; 33.1074 + ed->arch.ldt_ents = ents; 33.1075 + load_LDT(ed); 33.1076 + percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; 33.1077 + if ( ents != 0 ) 33.1078 + percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; 33.1079 + } 33.1080 + break; 33.1081 + } 33.1082 + 33.1083 + case MMUEXT_SET_FOREIGNDOM: 33.1084 + domid = (domid_t)(val >> 16); 33.1085 + 33.1086 + if ( (e = percpu_info[cpu].foreign) != NULL ) 33.1087 + put_domain(e); 33.1088 + percpu_info[cpu].foreign = NULL; 33.1089 + 33.1090 + if ( !IS_PRIV(d) ) 33.1091 + { 33.1092 + switch ( domid ) 33.1093 + { 33.1094 + case DOMID_IO: 33.1095 + get_knownalive_domain(dom_io); 33.1096 + percpu_info[cpu].foreign = dom_io; 33.1097 + break; 33.1098 + default: 33.1099 + MEM_LOG("Dom %u cannot set foreign dom\n", d->id); 33.1100 + okay = 0; 33.1101 + break; 33.1102 + } 33.1103 + } 33.1104 + else 33.1105 + { 33.1106 + percpu_info[cpu].foreign = e = find_domain_by_id(domid); 33.1107 + if ( e == NULL ) 33.1108 + { 33.1109 + switch ( domid ) 33.1110 + { 33.1111 + case DOMID_XEN: 33.1112 + get_knownalive_domain(dom_xen); 33.1113 + percpu_info[cpu].foreign = dom_xen; 33.1114 + break; 33.1115 + case DOMID_IO: 33.1116 + get_knownalive_domain(dom_io); 33.1117 + percpu_info[cpu].foreign = dom_io; 33.1118 + break; 33.1119 + default: 33.1120 + MEM_LOG("Unknown domain '%u'", 
domid); 33.1121 + okay = 0; 33.1122 + break; 33.1123 + } 33.1124 + } 33.1125 + } 33.1126 + break; 33.1127 + 33.1128 + case MMUEXT_TRANSFER_PAGE: 33.1129 + domid = (domid_t)(val >> 16); 33.1130 + gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); 33.1131 + 33.1132 + if ( unlikely(IS_XEN_HEAP_FRAME(page)) || 33.1133 + unlikely(!pfn_is_ram(pfn)) || 33.1134 + unlikely((e = find_domain_by_id(domid)) == NULL) ) 33.1135 + { 33.1136 + MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); 33.1137 + okay = 0; 33.1138 + break; 33.1139 + } 33.1140 + 33.1141 + spin_lock(&d->page_alloc_lock); 33.1142 + 33.1143 + /* 33.1144 + * The tricky bit: atomically release ownership while there is just one 33.1145 + * benign reference to the page (PGC_allocated). If that reference 33.1146 + * disappears then the deallocation routine will safely spin. 33.1147 + */ 33.1148 + nd = page_get_owner(page); 33.1149 + y = page->count_info; 33.1150 + do { 33.1151 + x = y; 33.1152 + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 33.1153 + (1|PGC_allocated)) || 33.1154 + unlikely(nd != d) ) 33.1155 + { 33.1156 + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 33.1157 + " caf=%08x, taf=%08x\n", page_to_pfn(page), 33.1158 + d, d->id, nd, x, page->u.inuse.type_info); 33.1159 + spin_unlock(&d->page_alloc_lock); 33.1160 + put_domain(e); 33.1161 + return 0; 33.1162 + } 33.1163 + __asm__ __volatile__( 33.1164 + LOCK_PREFIX "cmpxchg8b %2" 33.1165 + : "=d" (nd), "=a" (y), 33.1166 + "=m" (*(volatile u64 *)(&page->count_info)) 33.1167 + : "0" (d), "1" (x), "c" (NULL), "b" (x) ); 33.1168 + } 33.1169 + while ( unlikely(nd != d) || unlikely(y != x) ); 33.1170 + 33.1171 + /* 33.1172 + * Unlink from 'd'. At least one reference remains (now anonymous), so 33.1173 + * noone else is spinning to try to delete this page from 'd'. 33.1174 + */ 33.1175 + d->tot_pages--; 33.1176 + list_del(&page->list); 33.1177 + 33.1178 + spin_unlock(&d->page_alloc_lock); 33.1179 + 33.1180 + spin_lock(&e->page_alloc_lock); 33.1181 + 33.1182 + /* 33.1183 + * Check that 'e' will accept the page and has reservation headroom. 33.1184 + * Also, a domain mustn't have PGC_allocated pages when it is dying. 33.1185 + */ 33.1186 + ASSERT(e->tot_pages <= e->max_pages); 33.1187 + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 33.1188 + unlikely(e->tot_pages == e->max_pages) || 33.1189 + unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) 33.1190 + { 33.1191 + MEM_LOG("Transferee has no reservation headroom (%d,%d), or " 33.1192 + "provided a bad grant ref, or is dying (%08lx).\n", 33.1193 + e->tot_pages, e->max_pages, e->d_flags); 33.1194 + spin_unlock(&e->page_alloc_lock); 33.1195 + put_domain(e); 33.1196 + okay = 0; 33.1197 + break; 33.1198 + } 33.1199 + 33.1200 + /* Okay, add the page to 'e'. */ 33.1201 + if ( unlikely(e->tot_pages++ == 0) ) 33.1202 + get_knownalive_domain(e); 33.1203 + list_add_tail(&page->list, &e->page_list); 33.1204 + page_set_owner(page, e); 33.1205 + 33.1206 + spin_unlock(&e->page_alloc_lock); 33.1207 + 33.1208 + /* Transfer is all done: tell the guest about its new page frame. 
*/ 33.1209 + gnttab_notify_transfer(e, gntref, pfn); 33.1210 + 33.1211 + put_domain(e); 33.1212 + break; 33.1213 + 33.1214 + case MMUEXT_REASSIGN_PAGE: 33.1215 + if ( unlikely(!IS_PRIV(d)) ) 33.1216 + { 33.1217 + MEM_LOG("Dom %u has no reassignment priv", d->id); 33.1218 + okay = 0; 33.1219 + break; 33.1220 + } 33.1221 + 33.1222 + e = percpu_info[cpu].foreign; 33.1223 + if ( unlikely(e == NULL) ) 33.1224 + { 33.1225 + MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); 33.1226 + okay = 0; 33.1227 + break; 33.1228 + } 33.1229 + 33.1230 + /* 33.1231 + * Grab both page_list locks, in order. This prevents the page from 33.1232 + * disappearing elsewhere while we modify the owner, and we'll need 33.1233 + * both locks if we're successful so that we can change lists. 33.1234 + */ 33.1235 + if ( d < e ) 33.1236 + { 33.1237 + spin_lock(&d->page_alloc_lock); 33.1238 + spin_lock(&e->page_alloc_lock); 33.1239 + } 33.1240 + else 33.1241 + { 33.1242 + spin_lock(&e->page_alloc_lock); 33.1243 + spin_lock(&d->page_alloc_lock); 33.1244 + } 33.1245 + 33.1246 + /* A domain shouldn't have PGC_allocated pages when it is dying. */ 33.1247 + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 33.1248 + unlikely(IS_XEN_HEAP_FRAME(page)) ) 33.1249 + { 33.1250 + MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); 33.1251 + okay = 0; 33.1252 + goto reassign_fail; 33.1253 + } 33.1254 + 33.1255 + /* 33.1256 + * The tricky bit: atomically change owner while there is just one 33.1257 + * benign reference to the page (PGC_allocated). If that reference 33.1258 + * disappears then the deallocation routine will safely spin. 33.1259 + */ 33.1260 + nd = page_get_owner(page); 33.1261 + y = page->count_info; 33.1262 + do { 33.1263 + x = y; 33.1264 + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 33.1265 + (1|PGC_allocated)) || 33.1266 + unlikely(nd != d) ) 33.1267 + { 33.1268 + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 33.1269 + " caf=%08x, taf=%08x\n", page_to_pfn(page), 33.1270 + d, d->id, nd, x, page->u.inuse.type_info); 33.1271 + okay = 0; 33.1272 + goto reassign_fail; 33.1273 + } 33.1274 + __asm__ __volatile__( 33.1275 + LOCK_PREFIX "cmpxchg8b %3" 33.1276 + : "=d" (nd), "=a" (y), "=c" (e), 33.1277 + "=m" (*(volatile u64 *)(&page->count_info)) 33.1278 + : "0" (d), "1" (x), "c" (e), "b" (x) ); 33.1279 + } 33.1280 + while ( unlikely(nd != d) || unlikely(y != x) ); 33.1281 + 33.1282 + /* 33.1283 + * Unlink from 'd'. We transferred at least one reference to 'e', so 33.1284 + * noone else is spinning to try to delete this page from 'd'. 33.1285 + */ 33.1286 + d->tot_pages--; 33.1287 + list_del(&page->list); 33.1288 + 33.1289 + /* 33.1290 + * Add the page to 'e'. Someone may already have removed the last 33.1291 + * reference and want to remove the page from 'e'. However, we have 33.1292 + * the lock so they'll spin waiting for us. 
33.1293 + */ 33.1294 + if ( unlikely(e->tot_pages++ == 0) ) 33.1295 + get_knownalive_domain(e); 33.1296 + list_add_tail(&page->list, &e->page_list); 33.1297 + 33.1298 + reassign_fail: 33.1299 + spin_unlock(&d->page_alloc_lock); 33.1300 + spin_unlock(&e->page_alloc_lock); 33.1301 + break; 33.1302 + 33.1303 + case MMUEXT_CLEAR_FOREIGNDOM: 33.1304 + if ( (e = percpu_info[cpu].foreign) != NULL ) 33.1305 + put_domain(e); 33.1306 + percpu_info[cpu].foreign = NULL; 33.1307 + break; 33.1308 + 33.1309 + default: 33.1310 + MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 33.1311 + okay = 0; 33.1312 + break; 33.1313 + } 33.1314 + 33.1315 + return okay; 33.1316 +} 33.1317 + 33.1318 +int do_mmu_update( 33.1319 + mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) 33.1320 +{ 33.1321 +/* 33.1322 + * We steal the m.s.b. of the @count parameter to indicate whether this 33.1323 + * invocation of do_mmu_update() is resuming a previously preempted call. 33.1324 + * We steal the next 15 bits to remember the current FOREIGNDOM. 33.1325 + */ 33.1326 +#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) 33.1327 +#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) 33.1328 +#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT) 33.1329 + 33.1330 + mmu_update_t req; 33.1331 + unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0; 33.1332 + struct pfn_info *page; 33.1333 + int rc = 0, okay = 1, i = 0, cpu = smp_processor_id(); 33.1334 + unsigned int cmd, done = 0; 33.1335 + unsigned long prev_smfn = 0; 33.1336 + l1_pgentry_t *prev_spl1e = 0; 33.1337 + struct exec_domain *ed = current; 33.1338 + struct domain *d = ed->domain; 33.1339 + u32 type_info; 33.1340 + domid_t domid; 33.1341 + 33.1342 + LOCK_BIGLOCK(d); 33.1343 + 33.1344 + cleanup_writable_pagetable(d); 33.1345 + 33.1346 + if ( unlikely(shadow_mode(d)) ) 33.1347 + check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */ 33.1348 + 33.1349 + /* 33.1350 + * If we are resuming after preemption, read how much work we have already 33.1351 + * done. This allows us to set the @done output parameter correctly. 33.1352 + * We also reset FOREIGNDOM here. 33.1353 + */ 33.1354 + if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) 33.1355 + { 33.1356 + if ( !(count & MMU_UPDATE_PREEMPTED) ) 33.1357 + { 33.1358 + /* Count overflow into private FOREIGNDOM field. 
*/ 33.1359 + MEM_LOG("do_mmu_update count is too large"); 33.1360 + rc = -EINVAL; 33.1361 + goto out; 33.1362 + } 33.1363 + count &= ~MMU_UPDATE_PREEMPTED; 33.1364 + domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; 33.1365 + count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; 33.1366 + if ( unlikely(pdone != NULL) ) 33.1367 + (void)get_user(done, pdone); 33.1368 + if ( (domid != current->domain->id) && 33.1369 + !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) 33.1370 + { 33.1371 + rc = -EINVAL; 33.1372 + goto out; 33.1373 + } 33.1374 + } 33.1375 + 33.1376 + perfc_incrc(calls_to_mmu_update); 33.1377 + perfc_addc(num_page_updates, count); 33.1378 + 33.1379 + if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) 33.1380 + { 33.1381 + rc = -EFAULT; 33.1382 + goto out; 33.1383 + } 33.1384 + 33.1385 + for ( i = 0; i < count; i++ ) 33.1386 + { 33.1387 + if ( hypercall_preempt_check() ) 33.1388 + { 33.1389 + rc = hypercall3_create_continuation( 33.1390 + __HYPERVISOR_mmu_update, ureqs, 33.1391 + (count - i) | 33.1392 + (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 33.1393 + MMU_UPDATE_PREEMPTED, pdone); 33.1394 + break; 33.1395 + } 33.1396 + 33.1397 + if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) 33.1398 + { 33.1399 + MEM_LOG("Bad __copy_from_user"); 33.1400 + rc = -EFAULT; 33.1401 + break; 33.1402 + } 33.1403 + 33.1404 + cmd = req.ptr & (sizeof(l1_pgentry_t)-1); 33.1405 + pfn = req.ptr >> PAGE_SHIFT; 33.1406 + 33.1407 + okay = 0; 33.1408 + 33.1409 + switch ( cmd ) 33.1410 + { 33.1411 + /* 33.1412 + * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 33.1413 + */ 33.1414 + case MMU_NORMAL_PT_UPDATE: 33.1415 + if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) 33.1416 + { 33.1417 + MEM_LOG("Could not get page for normal update"); 33.1418 + break; 33.1419 + } 33.1420 + 33.1421 + if ( likely(prev_pfn == pfn) ) 33.1422 + { 33.1423 + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 33.1424 + } 33.1425 + else 33.1426 + { 33.1427 + if ( prev_pfn != 0 ) 33.1428 + unmap_domain_mem((void *)va); 33.1429 + va = (unsigned long)map_domain_mem(req.ptr); 33.1430 + prev_pfn = pfn; 33.1431 + } 33.1432 + 33.1433 + page = &frame_table[pfn]; 33.1434 + switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) 33.1435 + { 33.1436 + case PGT_l1_page_table: 33.1437 + if ( likely(get_page_type( 33.1438 + page, type_info & (PGT_type_mask|PGT_va_mask))) ) 33.1439 + { 33.1440 + okay = mod_l1_entry((l1_pgentry_t *)va, 33.1441 + mk_l1_pgentry(req.val)); 33.1442 + 33.1443 + if ( unlikely(shadow_mode(d)) && okay && 33.1444 + (get_shadow_status(d, page-frame_table) & 33.1445 + PSH_shadowed) ) 33.1446 + { 33.1447 + shadow_l1_normal_pt_update( 33.1448 + req.ptr, req.val, &prev_smfn, &prev_spl1e); 33.1449 + put_shadow_status(d); 33.1450 + } 33.1451 + 33.1452 + put_page_type(page); 33.1453 + } 33.1454 + break; 33.1455 + case PGT_l2_page_table: 33.1456 + if ( likely(get_page_type(page, PGT_l2_page_table)) ) 33.1457 + { 33.1458 + okay = mod_l2_entry((l2_pgentry_t *)va, 33.1459 + mk_l2_pgentry(req.val), 33.1460 + pfn); 33.1461 + 33.1462 + if ( unlikely(shadow_mode(d)) && okay && 33.1463 + (get_shadow_status(d, page-frame_table) & 33.1464 + PSH_shadowed) ) 33.1465 + { 33.1466 + shadow_l2_normal_pt_update(req.ptr, req.val); 33.1467 + put_shadow_status(d); 33.1468 + } 33.1469 + 33.1470 + put_page_type(page); 33.1471 + } 33.1472 + break; 33.1473 + default: 33.1474 + if ( likely(get_page_type(page, PGT_writable_page)) ) 33.1475 + { 33.1476 + *(unsigned long *)va = 
req.val; 33.1477 + okay = 1; 33.1478 + put_page_type(page); 33.1479 + } 33.1480 + break; 33.1481 + } 33.1482 + 33.1483 + put_page(page); 33.1484 + break; 33.1485 + 33.1486 + case MMU_MACHPHYS_UPDATE: 33.1487 + if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) 33.1488 + { 33.1489 + MEM_LOG("Could not get page for mach->phys update"); 33.1490 + break; 33.1491 + } 33.1492 + 33.1493 + machine_to_phys_mapping[pfn] = req.val; 33.1494 + okay = 1; 33.1495 + 33.1496 + /* 33.1497 + * If in log-dirty mode, mark the corresponding pseudo-physical 33.1498 + * page as dirty. 33.1499 + */ 33.1500 + if ( unlikely(shadow_mode(d) == SHM_logdirty) && 33.1501 + mark_dirty(d, pfn) ) 33.1502 + d->arch.shadow_dirty_block_count++; 33.1503 + 33.1504 + put_page(&frame_table[pfn]); 33.1505 + break; 33.1506 + 33.1507 + /* 33.1508 + * MMU_EXTENDED_COMMAND: Extended command is specified 33.1509 + * in the least-siginificant bits of the 'value' field. 33.1510 + */ 33.1511 + case MMU_EXTENDED_COMMAND: 33.1512 + req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 33.1513 + okay = do_extended_command(req.ptr, req.val); 33.1514 + break; 33.1515 + 33.1516 + default: 33.1517 + MEM_LOG("Invalid page update command %08lx", req.ptr); 33.1518 + break; 33.1519 + } 33.1520 + 33.1521 + if ( unlikely(!okay) ) 33.1522 + { 33.1523 + rc = -EINVAL; 33.1524 + break; 33.1525 + } 33.1526 + 33.1527 + ureqs++; 33.1528 + } 33.1529 + 33.1530 + out: 33.1531 + if ( prev_pfn != 0 ) 33.1532 + unmap_domain_mem((void *)va); 33.1533 + 33.1534 + if ( unlikely(prev_spl1e != 0) ) 33.1535 + unmap_domain_mem((void *)prev_spl1e); 33.1536 + 33.1537 + deferred_ops = percpu_info[cpu].deferred_ops; 33.1538 + percpu_info[cpu].deferred_ops = 0; 33.1539 + 33.1540 + if ( deferred_ops & DOP_FLUSH_TLB ) 33.1541 + local_flush_tlb(); 33.1542 + 33.1543 + if ( deferred_ops & DOP_RELOAD_LDT ) 33.1544 + (void)map_ldt_shadow_page(0); 33.1545 + 33.1546 + if ( unlikely(percpu_info[cpu].foreign != NULL) ) 33.1547 + { 33.1548 + put_domain(percpu_info[cpu].foreign); 33.1549 + percpu_info[cpu].foreign = NULL; 33.1550 + } 33.1551 + 33.1552 + /* Add incremental work we have done to the @done output parameter. */ 33.1553 + if ( unlikely(pdone != NULL) ) 33.1554 + __put_user(done + i, pdone); 33.1555 + 33.1556 + if ( unlikely(shadow_mode(d)) ) 33.1557 + check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */ 33.1558 + 33.1559 + UNLOCK_BIGLOCK(d); 33.1560 + return rc; 33.1561 +} 33.1562 + 33.1563 + 33.1564 +int do_update_va_mapping(unsigned long va, 33.1565 + unsigned long val, 33.1566 + unsigned long flags) 33.1567 +{ 33.1568 + struct exec_domain *ed = current; 33.1569 + struct domain *d = ed->domain; 33.1570 + int err = 0; 33.1571 + unsigned int cpu = ed->processor; 33.1572 + unsigned long deferred_ops; 33.1573 + 33.1574 + perfc_incrc(calls_to_update_va); 33.1575 + 33.1576 + if ( unlikely(!__addr_ok(va)) ) 33.1577 + return -EINVAL; 33.1578 + 33.1579 + LOCK_BIGLOCK(d); 33.1580 + 33.1581 + cleanup_writable_pagetable(d); 33.1582 + 33.1583 + /* 33.1584 + * XXX When we make this support 4MB superpages we should also deal with 33.1585 + * the case of updating L2 entries. 
33.1586 + */ 33.1587 + 33.1588 + if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], 33.1589 + mk_l1_pgentry(val))) ) 33.1590 + err = -EINVAL; 33.1591 + 33.1592 + if ( unlikely(shadow_mode(d)) ) 33.1593 + { 33.1594 + unsigned long sval = 0; 33.1595 + 33.1596 + l1pte_propagate_from_guest(d, &val, &sval); 33.1597 + 33.1598 + if ( unlikely(__put_user(sval, ((unsigned long *)( 33.1599 + &shadow_linear_pg_table[l1_linear_offset(va)])))) ) 33.1600 + { 33.1601 + /* 33.1602 + * Since L2's are guranteed RW, failure indicates the page was not 33.1603 + * shadowed, so ignore. 33.1604 + */ 33.1605 + perfc_incrc(shadow_update_va_fail); 33.1606 + } 33.1607 + 33.1608 + /* 33.1609 + * If we're in log-dirty mode then we need to note that we've updated 33.1610 + * the PTE in the PT-holding page. We need the machine frame number 33.1611 + * for this. 33.1612 + */ 33.1613 + if ( shadow_mode(d) == SHM_logdirty ) 33.1614 + mark_dirty(d, va_to_l1mfn(va)); 33.1615 + 33.1616 + check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ 33.1617 + } 33.1618 + 33.1619 + deferred_ops = percpu_info[cpu].deferred_ops; 33.1620 + percpu_info[cpu].deferred_ops = 0; 33.1621 + 33.1622 + if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 33.1623 + unlikely(flags & UVMF_FLUSH_TLB) ) 33.1624 + local_flush_tlb(); 33.1625 + else if ( unlikely(flags & UVMF_INVLPG) ) 33.1626 + __flush_tlb_one(va); 33.1627 + 33.1628 + if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) 33.1629 + (void)map_ldt_shadow_page(0); 33.1630 + 33.1631 + UNLOCK_BIGLOCK(d); 33.1632 + 33.1633 + return err; 33.1634 +} 33.1635 + 33.1636 +int do_update_va_mapping_otherdomain(unsigned long va, 33.1637 + unsigned long val, 33.1638 + unsigned long flags, 33.1639 + domid_t domid) 33.1640 +{ 33.1641 + unsigned int cpu = smp_processor_id(); 33.1642 + struct domain *d; 33.1643 + int rc; 33.1644 + 33.1645 + if ( unlikely(!IS_PRIV(current->domain)) ) 33.1646 + return -EPERM; 33.1647 + 33.1648 + percpu_info[cpu].foreign = d = find_domain_by_id(domid); 33.1649 + if ( unlikely(d == NULL) ) 33.1650 + { 33.1651 + MEM_LOG("Unknown domain '%u'", domid); 33.1652 + return -ESRCH; 33.1653 + } 33.1654 + 33.1655 + rc = do_update_va_mapping(va, val, flags); 33.1656 + 33.1657 + put_domain(d); 33.1658 + percpu_info[cpu].foreign = NULL; 33.1659 + 33.1660 + return rc; 33.1661 +} 33.1662 + 33.1663 + 33.1664 + 33.1665 +/************************* 33.1666 + * Descriptor Tables 33.1667 + */ 33.1668 + 33.1669 +void destroy_gdt(struct exec_domain *ed) 33.1670 +{ 33.1671 + int i; 33.1672 + unsigned long pfn; 33.1673 + 33.1674 + for ( i = 0; i < 16; i++ ) 33.1675 + { 33.1676 + if ( (pfn = l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i])) != 0 ) 33.1677 + put_page_and_type(&frame_table[pfn]); 33.1678 + ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 33.1679 + } 33.1680 +} 33.1681 + 33.1682 + 33.1683 +long set_gdt(struct exec_domain *ed, 33.1684 + unsigned long *frames, 33.1685 + unsigned int entries) 33.1686 +{ 33.1687 + struct domain *d = ed->domain; 33.1688 + /* NB. There are 512 8-byte entries per GDT page. */ 33.1689 + int i = 0, nr_pages = (entries + 511) / 512; 33.1690 + struct desc_struct *vgdt; 33.1691 + unsigned long pfn; 33.1692 + 33.1693 + /* Check the first page in the new GDT. */ 33.1694 + if ( (pfn = frames[0]) >= max_page ) 33.1695 + goto fail; 33.1696 + 33.1697 + /* The first page is special because Xen owns a range of entries in it. 
*/ 33.1698 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 33.1699 + { 33.1700 + /* GDT checks failed: try zapping the Xen reserved entries. */ 33.1701 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) 33.1702 + goto fail; 33.1703 + vgdt = map_domain_mem(pfn << PAGE_SHIFT); 33.1704 + memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, 33.1705 + NR_RESERVED_GDT_ENTRIES*8); 33.1706 + unmap_domain_mem(vgdt); 33.1707 + put_page_and_type(&frame_table[pfn]); 33.1708 + 33.1709 + /* Okay, we zapped the entries. Now try the GDT checks again. */ 33.1710 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 33.1711 + goto fail; 33.1712 + } 33.1713 + 33.1714 + /* Check the remaining pages in the new GDT. */ 33.1715 + for ( i = 1; i < nr_pages; i++ ) 33.1716 + if ( ((pfn = frames[i]) >= max_page) || 33.1717 + !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 33.1718 + goto fail; 33.1719 + 33.1720 + /* Copy reserved GDT entries to the new GDT. */ 33.1721 + vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 33.1722 + memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 33.1723 + gdt_table + FIRST_RESERVED_GDT_ENTRY, 33.1724 + NR_RESERVED_GDT_ENTRIES*8); 33.1725 + unmap_domain_mem(vgdt); 33.1726 + 33.1727 + /* Tear down the old GDT. */ 33.1728 + destroy_gdt(ed); 33.1729 + 33.1730 + /* Install the new GDT. */ 33.1731 + for ( i = 0; i < nr_pages; i++ ) 33.1732 + ed->arch.perdomain_ptes[i] = 33.1733 + mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 33.1734 + 33.1735 + SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); 33.1736 + SET_GDT_ENTRIES(ed, entries); 33.1737 + 33.1738 + return 0; 33.1739 + 33.1740 + fail: 33.1741 + while ( i-- > 0 ) 33.1742 + put_page_and_type(&frame_table[frames[i]]); 33.1743 + return -EINVAL; 33.1744 +} 33.1745 + 33.1746 + 33.1747 +long do_set_gdt(unsigned long *frame_list, unsigned int entries) 33.1748 +{ 33.1749 + int nr_pages = (entries + 511) / 512; 33.1750 + unsigned long frames[16]; 33.1751 + long ret; 33.1752 + 33.1753 + if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 33.1754 + return -EINVAL; 33.1755 + 33.1756 + if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) 33.1757 + return -EFAULT; 33.1758 + 33.1759 + LOCK_BIGLOCK(current->domain); 33.1760 + 33.1761 + if ( (ret = set_gdt(current, frames, entries)) == 0 ) 33.1762 + { 33.1763 + local_flush_tlb(); 33.1764 + __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); 33.1765 + } 33.1766 + 33.1767 + UNLOCK_BIGLOCK(current->domain); 33.1768 + 33.1769 + return ret; 33.1770 +} 33.1771 + 33.1772 + 33.1773 +long do_update_descriptor( 33.1774 + unsigned long pa, unsigned long word1, unsigned long word2) 33.1775 +{ 33.1776 + unsigned long pfn = pa >> PAGE_SHIFT; 33.1777 + struct desc_struct *gdt_pent, d; 33.1778 + struct pfn_info *page; 33.1779 + struct exec_domain *ed; 33.1780 + long ret = -EINVAL; 33.1781 + 33.1782 + d.a = (u32)word1; 33.1783 + d.b = (u32)word2; 33.1784 + 33.1785 + LOCK_BIGLOCK(current->domain); 33.1786 + 33.1787 + if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { 33.1788 + UNLOCK_BIGLOCK(current->domain); 33.1789 + return -EINVAL; 33.1790 + } 33.1791 + 33.1792 + page = &frame_table[pfn]; 33.1793 + if ( unlikely(!get_page(page, current->domain)) ) { 33.1794 + UNLOCK_BIGLOCK(current->domain); 33.1795 + return -EINVAL; 33.1796 + } 33.1797 + 33.1798 + /* Check if the given frame is in use in an unsafe context. 
*/ 33.1799 + switch ( page->u.inuse.type_info & PGT_type_mask ) 33.1800 + { 33.1801 + case PGT_gdt_page: 33.1802 + /* Disallow updates of Xen-reserved descriptors in the current GDT. */ 33.1803 + for_each_exec_domain(current->domain, ed) { 33.1804 + if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) && 33.1805 + (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 33.1806 + (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 33.1807 + goto out; 33.1808 + } 33.1809 + if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 33.1810 + goto out; 33.1811 + break; 33.1812 + case PGT_ldt_page: 33.1813 + if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 33.1814 + goto out; 33.1815 + break; 33.1816 + default: 33.1817 + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) 33.1818 + goto out; 33.1819 + break; 33.1820 + } 33.1821 + 33.1822 + /* All is good so make the update. */ 33.1823 + gdt_pent = map_domain_mem(pa); 33.1824 + memcpy(gdt_pent, &d, 8); 33.1825 + unmap_domain_mem(gdt_pent); 33.1826 + 33.1827 + put_page_type(page); 33.1828 + 33.1829 + ret = 0; /* success */ 33.1830 + 33.1831 + out: 33.1832 + put_page(page); 33.1833 + 33.1834 + UNLOCK_BIGLOCK(current->domain); 33.1835 + 33.1836 + return ret; 33.1837 +} 33.1838 + 33.1839 + 33.1840 + 33.1841 +/************************* 33.1842 + * Writable Pagetables 33.1843 + */ 33.1844 + 33.1845 +ptwr_info_t ptwr_info[NR_CPUS]; 33.1846 + 33.1847 +#ifdef VERBOSE 33.1848 +int ptwr_debug = 0x0; 33.1849 +#define PTWR_PRINTK(_f, _a...) \ 33.1850 + do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) 33.1851 +#define PTWR_PRINT_WHICH (which ? 'I' : 'A') 33.1852 +#else 33.1853 +#define PTWR_PRINTK(_f, _a...) ((void)0) 33.1854 +#endif 33.1855 + 33.1856 +/* Flush the given writable p.t. page and write-protect it again. */ 33.1857 +void ptwr_flush(const int which) 33.1858 +{ 33.1859 + unsigned long sstat, spte, pte, *ptep, l1va; 33.1860 + l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; 33.1861 + l2_pgentry_t *pl2e; 33.1862 + int i, cpu = smp_processor_id(); 33.1863 + struct exec_domain *ed = current; 33.1864 + struct domain *d = ed->domain; 33.1865 + 33.1866 + l1va = ptwr_info[cpu].ptinfo[which].l1va; 33.1867 + ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; 33.1868 + 33.1869 + /* 33.1870 + * STEP 1. Write-protect the p.t. page so no more updates can occur. 33.1871 + */ 33.1872 + 33.1873 + if ( unlikely(__get_user(pte, ptep)) ) 33.1874 + { 33.1875 + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 33.1876 + /* 33.1877 + * Really a bug. We could read this PTE during the initial fault, 33.1878 + * and pagetables can't have changed meantime. XXX Multi-CPU guests? 33.1879 + */ 33.1880 + BUG(); 33.1881 + } 33.1882 + PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", 33.1883 + PTWR_PRINT_WHICH, ptep, pte); 33.1884 + pte &= ~_PAGE_RW; 33.1885 + 33.1886 + if ( unlikely(shadow_mode(d)) ) 33.1887 + { 33.1888 + /* Write-protect the p.t. page in the shadow page table. */ 33.1889 + l1pte_propagate_from_guest(d, &pte, &spte); 33.1890 + __put_user( 33.1891 + spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); 33.1892 + 33.1893 + /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ 33.1894 + sstat = get_shadow_status(d, pte >> PAGE_SHIFT); 33.1895 + if ( sstat & PSH_shadowed ) 33.1896 + sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); 33.1897 + } 33.1898 + 33.1899 + /* Write-protect the p.t. page in the guest page table. 
*/ 33.1900 + if ( unlikely(__put_user(pte, ptep)) ) 33.1901 + { 33.1902 + MEM_LOG("ptwr: Could not update pte at %p\n", ptep); 33.1903 + /* 33.1904 + * Really a bug. We could write this PTE during the initial fault, 33.1905 + * and pagetables can't have changed meantime. XXX Multi-CPU guests? 33.1906 + */ 33.1907 + BUG(); 33.1908 + } 33.1909 + 33.1910 + /* Ensure that there are no stale writable mappings in any TLB. */ 33.1911 + /* NB. INVLPG is a serialising instruction: flushes pending updates. */ 33.1912 +#if 1 33.1913 + __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ 33.1914 +#else 33.1915 + flush_tlb_all(); 33.1916 +#endif 33.1917 + PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", 33.1918 + PTWR_PRINT_WHICH, ptep, pte); 33.1919 + 33.1920 + /* 33.1921 + * STEP 2. Validate any modified PTEs. 33.1922 + */ 33.1923 + 33.1924 + pl1e = ptwr_info[cpu].ptinfo[which].pl1e; 33.1925 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.1926 + { 33.1927 + ol1e = ptwr_info[cpu].ptinfo[which].page[i]; 33.1928 + nl1e = pl1e[i]; 33.1929 + 33.1930 + if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) 33.1931 + continue; 33.1932 + 33.1933 + /* 33.1934 + * Fast path for PTEs that have merely been write-protected 33.1935 + * (e.g., during a Unix fork()). A strict reduction in privilege. 33.1936 + */ 33.1937 + if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) 33.1938 + { 33.1939 + if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) 33.1940 + { 33.1941 + if ( unlikely(sl1e != NULL) ) 33.1942 + l1pte_propagate_from_guest( 33.1943 + d, &l1_pgentry_val(nl1e), 33.1944 + &l1_pgentry_val(sl1e[i])); 33.1945 + put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]); 33.1946 + } 33.1947 + continue; 33.1948 + } 33.1949 + 33.1950 + if ( unlikely(!get_page_from_l1e(nl1e, d)) ) 33.1951 + { 33.1952 + MEM_LOG("ptwr: Could not re-validate l1 page\n"); 33.1953 + /* 33.1954 + * Make the remaining p.t's consistent before crashing, so the 33.1955 + * reference counts are correct. 33.1956 + */ 33.1957 + memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], 33.1958 + (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); 33.1959 + unmap_domain_mem(pl1e); 33.1960 + ptwr_info[cpu].ptinfo[which].l1va = 0; 33.1961 + UNLOCK_BIGLOCK(d); 33.1962 + domain_crash(); 33.1963 + } 33.1964 + 33.1965 + if ( unlikely(sl1e != NULL) ) 33.1966 + l1pte_propagate_from_guest( 33.1967 + d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); 33.1968 + 33.1969 + if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) 33.1970 + put_page_from_l1e(ol1e, d); 33.1971 + } 33.1972 + unmap_domain_mem(pl1e); 33.1973 + 33.1974 + /* 33.1975 + * STEP 3. Reattach the L1 p.t. page into the current address space. 33.1976 + */ 33.1977 + 33.1978 + if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) ) 33.1979 + { 33.1980 + pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; 33.1981 + *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 33.1982 + } 33.1983 + 33.1984 + /* 33.1985 + * STEP 4. Final tidy-up. 33.1986 + */ 33.1987 + 33.1988 + ptwr_info[cpu].ptinfo[which].l1va = 0; 33.1989 + 33.1990 + if ( unlikely(sl1e != NULL) ) 33.1991 + { 33.1992 + unmap_domain_mem(sl1e); 33.1993 + put_shadow_status(d); 33.1994 + } 33.1995 +} 33.1996 + 33.1997 +/* Write page fault handler: check if guest is trying to modify a PTE. 
*/ 33.1998 +int ptwr_do_page_fault(unsigned long addr) 33.1999 +{ 33.2000 + unsigned long pte, pfn, l2e; 33.2001 + struct pfn_info *page; 33.2002 + l2_pgentry_t *pl2e; 33.2003 + int which, cpu = smp_processor_id(); 33.2004 + u32 l2_idx; 33.2005 + 33.2006 +#ifdef __x86_64__ 33.2007 + return 0; /* Writable pagetables need fixing for x86_64. */ 33.2008 +#endif 33.2009 + 33.2010 + /* 33.2011 + * Attempt to read the PTE that maps the VA being accessed. By checking for 33.2012 + * PDE validity in the L2 we avoid many expensive fixups in __get_user(). 33.2013 + */ 33.2014 + if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & 33.2015 + _PAGE_PRESENT) || 33.2016 + __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) 33.2017 + { 33.2018 + return 0; 33.2019 + } 33.2020 + 33.2021 + pfn = pte >> PAGE_SHIFT; 33.2022 + page = &frame_table[pfn]; 33.2023 + 33.2024 + /* We are looking only for read-only mappings of p.t. pages. */ 33.2025 + if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || 33.2026 + ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) 33.2027 + { 33.2028 + return 0; 33.2029 + } 33.2030 + 33.2031 + /* Get the L2 index at which this L1 p.t. is always mapped. */ 33.2032 + l2_idx = page->u.inuse.type_info & PGT_va_mask; 33.2033 + if ( unlikely(l2_idx >= PGT_va_unknown) ) 33.2034 + { 33.2035 + domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ 33.2036 + } 33.2037 + l2_idx >>= PGT_va_shift; 33.2038 + 33.2039 + if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) 33.2040 + { 33.2041 + MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); 33.2042 + domain_crash(); 33.2043 + } 33.2044 + 33.2045 + /* 33.2046 + * Is the L1 p.t. mapped into the current address space? If so we call it 33.2047 + * an ACTIVE p.t., otherwise it is INACTIVE. 33.2048 + */ 33.2049 + pl2e = &linear_l2_table[l2_idx]; 33.2050 + l2e = l2_pgentry_val(*pl2e); 33.2051 + which = PTWR_PT_INACTIVE; 33.2052 + if ( (l2e >> PAGE_SHIFT) == pfn ) 33.2053 + { 33.2054 + /* Check the PRESENT bit to set ACTIVE. */ 33.2055 + if ( likely(l2e & _PAGE_PRESENT) ) 33.2056 + which = PTWR_PT_ACTIVE; 33.2057 + else { 33.2058 + /* 33.2059 + * If the PRESENT bit is clear, we may be conflicting with 33.2060 + * the current ACTIVE p.t. (it may be the same p.t. mapped 33.2061 + * at another virt addr). 33.2062 + * The ptwr_flush call below will restore the PRESENT bit. 33.2063 + */ 33.2064 + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && 33.2065 + l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) 33.2066 + which = PTWR_PT_ACTIVE; 33.2067 + } 33.2068 + } 33.2069 + 33.2070 + PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " 33.2071 + "pfn %08lx\n", PTWR_PRINT_WHICH, 33.2072 + addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); 33.2073 + 33.2074 + /* 33.2075 + * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at 33.2076 + * time. If there is already one, we must flush it out. 33.2077 + */ 33.2078 + if ( ptwr_info[cpu].ptinfo[which].l1va ) 33.2079 + ptwr_flush(which); 33.2080 + 33.2081 + ptwr_info[cpu].ptinfo[which].l1va = addr | 1; 33.2082 + ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; 33.2083 + 33.2084 + /* For safety, disconnect the L1 p.t. page from current space. */ 33.2085 + if ( (which == PTWR_PT_ACTIVE) && 33.2086 + likely(!shadow_mode(current->domain)) ) 33.2087 + { 33.2088 + *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); 33.2089 +#if 1 33.2090 + flush_tlb(); /* XXX Multi-CPU guests? 
*/ 33.2091 +#else 33.2092 + flush_tlb_all(); 33.2093 +#endif 33.2094 + } 33.2095 + 33.2096 + /* Temporarily map the L1 page, and make a copy of it. */ 33.2097 + ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); 33.2098 + memcpy(ptwr_info[cpu].ptinfo[which].page, 33.2099 + ptwr_info[cpu].ptinfo[which].pl1e, 33.2100 + ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); 33.2101 + 33.2102 + /* Finally, make the p.t. page writable by the guest OS. */ 33.2103 + pte |= _PAGE_RW; 33.2104 + PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, 33.2105 + &linear_pg_table[addr>>PAGE_SHIFT], pte); 33.2106 + if ( unlikely(__put_user(pte, (unsigned long *) 33.2107 + &linear_pg_table[addr>>PAGE_SHIFT])) ) 33.2108 + { 33.2109 + MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) 33.2110 + &linear_pg_table[addr>>PAGE_SHIFT]); 33.2111 + /* Toss the writable pagetable state and crash. */ 33.2112 + unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); 33.2113 + ptwr_info[cpu].ptinfo[which].l1va = 0; 33.2114 + domain_crash(); 33.2115 + } 33.2116 + 33.2117 + return EXCRET_fault_fixed; 33.2118 +} 33.2119 + 33.2120 +static __init int ptwr_init(void) 33.2121 +{ 33.2122 + int i; 33.2123 + 33.2124 + for ( i = 0; i < smp_num_cpus; i++ ) 33.2125 + { 33.2126 + ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = 33.2127 + (void *)alloc_xenheap_page(); 33.2128 + ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = 33.2129 + (void *)alloc_xenheap_page(); 33.2130 + } 33.2131 + 33.2132 + return 0; 33.2133 +} 33.2134 +__initcall(ptwr_init); 33.2135 + 33.2136 + 33.2137 + 33.2138 + 33.2139 +/************************************************************************/ 33.2140 +/************************************************************************/ 33.2141 +/************************************************************************/ 33.2142 + 33.2143 +#ifndef NDEBUG 33.2144 + 33.2145 +void ptwr_status(void) 33.2146 +{ 33.2147 + unsigned long pte, *ptep, pfn; 33.2148 + struct pfn_info *page; 33.2149 + int cpu = smp_processor_id(); 33.2150 + 33.2151 + ptep = (unsigned long *)&linear_pg_table 33.2152 + [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; 33.2153 + 33.2154 + if ( __get_user(pte, ptep) ) { 33.2155 + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 33.2156 + domain_crash(); 33.2157 + } 33.2158 + 33.2159 + pfn = pte >> PAGE_SHIFT; 33.2160 + page = &frame_table[pfn]; 33.2161 + printk("need to alloc l1 page %p\n", page); 33.2162 + /* make pt page writable */ 33.2163 + printk("need to make read-only l1-page at %p is %08lx\n", 33.2164 + ptep, pte); 33.2165 + 33.2166 + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) 33.2167 + return; 33.2168 + 33.2169 + if ( __get_user(pte, (unsigned long *) 33.2170 + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { 33.2171 + MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) 33.2172 + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); 33.2173 + domain_crash(); 33.2174 + } 33.2175 + pfn = pte >> PAGE_SHIFT; 33.2176 + page = &frame_table[pfn]; 33.2177 +} 33.2178 + 33.2179 +void audit_domain(struct domain *d) 33.2180 +{ 33.2181 + int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; 33.2182 + 33.2183 + void adjust (struct pfn_info *page, int dir, int adjtype) 33.2184 + { 33.2185 + int count = page->count_info & PGC_count_mask; 33.2186 + 33.2187 + if ( adjtype ) 33.2188 + { 33.2189 + int tcount = page->u.inuse.type_info & PGT_count_mask; 33.2190 + 33.2191 + ttot++; 33.2192 + 33.2193 + tcount += dir; 33.2194 + 33.2195 + if ( tcount < 0 ) 33.2196 + { 
33.2197 + /* This will only come out once. */ 33.2198 + printk("Audit %d: type count whent below zero pfn=%x " 33.2199 + "taf=%x otaf=%x\n", 33.2200 + d->id, page-frame_table, 33.2201 + page->u.inuse.type_info, 33.2202 + page->tlbflush_timestamp); 33.2203 + } 33.2204 + 33.2205 + page->u.inuse.type_info = 33.2206 + (page->u.inuse.type_info & ~PGT_count_mask) | 33.2207 + (tcount & PGT_count_mask); 33.2208 + } 33.2209 + 33.2210 + ctot++; 33.2211 + count += dir; 33.2212 + if ( count < 0 ) 33.2213 + { 33.2214 + /* This will only come out once. */ 33.2215 + printk("Audit %d: general count whent below zero pfn=%x " 33.2216 + "taf=%x otaf=%x\n", 33.2217 + d->id, page-frame_table, 33.2218 + page->u.inuse.type_info, 33.2219 + page->tlbflush_timestamp); 33.2220 + } 33.2221 + 33.2222 + page->count_info = 33.2223 + (page->count_info & ~PGC_count_mask) | 33.2224 + (count & PGC_count_mask); 33.2225 + 33.2226 + } 33.2227 + 33.2228 + void scan_for_pfn(struct domain *d, unsigned long xpfn) 33.2229 + { 33.2230 + unsigned long pfn, *pt; 33.2231 + struct list_head *list_ent; 33.2232 + struct pfn_info *page; 33.2233 + int i; 33.2234 + 33.2235 + list_ent = d->page_list.next; 33.2236 + for ( i = 0; (list_ent != &d->page_list); i++ ) 33.2237 + { 33.2238 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 33.2239 + page = &frame_table[pfn]; 33.2240 + 33.2241 + switch ( page->u.inuse.type_info & PGT_type_mask ) 33.2242 + { 33.2243 + case PGT_l1_page_table: 33.2244 + case PGT_l2_page_table: 33.2245 + pt = map_domain_mem(pfn<<PAGE_SHIFT); 33.2246 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.2247 + if ( (pt[i] & _PAGE_PRESENT) && 33.2248 + ((pt[i] >> PAGE_SHIFT) == xpfn) ) 33.2249 + printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", 33.2250 + d->id, i, pfn, page->u.inuse.type_info, 33.2251 + page->count_info); 33.2252 + unmap_domain_mem(pt); 33.2253 + } 33.2254 + 33.2255 + list_ent = frame_table[pfn].list.next; 33.2256 + } 33.2257 + 33.2258 + } 33.2259 + 33.2260 + void scan_for_pfn_remote(unsigned long xpfn) 33.2261 + { 33.2262 + struct domain *e; 33.2263 + for_each_domain ( e ) 33.2264 + scan_for_pfn( e, xpfn ); 33.2265 + } 33.2266 + 33.2267 + int i; 33.2268 + unsigned long pfn; 33.2269 + struct list_head *list_ent; 33.2270 + struct pfn_info *page; 33.2271 + 33.2272 + if ( d != current->domain ) 33.2273 + domain_pause(d); 33.2274 + synchronise_pagetables(~0UL); 33.2275 + 33.2276 + printk("pt base=%lx sh_info=%x\n", 33.2277 + pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT, 33.2278 + virt_to_page(d->shared_info)-frame_table); 33.2279 + 33.2280 + spin_lock(&d->page_alloc_lock); 33.2281 + 33.2282 + /* PHASE 0 */ 33.2283 + 33.2284 + list_ent = d->page_list.next; 33.2285 + for ( i = 0; (list_ent != &d->page_list); i++ ) 33.2286 + { 33.2287 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 33.2288 + page = &frame_table[pfn]; 33.2289 + 33.2290 + if ( page_get_owner(page) != d ) 33.2291 + BUG(); 33.2292 + 33.2293 + if ( (page->u.inuse.type_info & PGT_count_mask) > 33.2294 + (page->count_info & PGC_count_mask) ) 33.2295 + printk("taf > caf %x %x pfn=%lx\n", 33.2296 + page->u.inuse.type_info, page->count_info, pfn ); 33.2297 + 33.2298 +#if 0 /* SYSV shared memory pages plus writeable files. 
*/ 33.2299 + if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 33.2300 + (page->u.inuse.type_info & PGT_count_mask) > 1 ) 33.2301 + { 33.2302 + printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", 33.2303 + pfn, 33.2304 + page->u.inuse.type_info, 33.2305 + page->count_info ); 33.2306 + scan_for_pfn_remote(pfn); 33.2307 + } 33.2308 +#endif 33.2309 + if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 33.2310 + (page->u.inuse.type_info & PGT_count_mask) > 1 ) 33.2311 + { 33.2312 + printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", 33.2313 + pfn, 33.2314 + page->u.inuse.type_info, 33.2315 + page->count_info ); 33.2316 + } 33.2317 + 33.2318 + /* Use tlbflush_timestamp to store original type_info. */ 33.2319 + page->tlbflush_timestamp = page->u.inuse.type_info; 33.2320 + 33.2321 + list_ent = frame_table[pfn].list.next; 33.2322 + } 33.2323 + 33.2324 + 33.2325 + /* PHASE 1 */ 33.2326 + 33.2327 + adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1); 33.2328 + 33.2329 + list_ent = d->page_list.next; 33.2330 + for ( i = 0; (list_ent != &d->page_list); i++ ) 33.2331 + { 33.2332 + unsigned long *pt; 33.2333 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 33.2334 + page = &frame_table[pfn]; 33.2335 + 33.2336 + if ( page_get_owner(page) != d ) 33.2337 + BUG(); 33.2338 + 33.2339 + switch ( page->u.inuse.type_info & PGT_type_mask ) 33.2340 + { 33.2341 + case PGT_l2_page_table: 33.2342 + 33.2343 + if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 33.2344 + printk("Audit %d: L2 not validated %x\n", 33.2345 + d->id, page->u.inuse.type_info); 33.2346 + 33.2347 + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 33.2348 + printk("Audit %d: L2 not pinned %x\n", 33.2349 + d->id, page->u.inuse.type_info); 33.2350 + else 33.2351 + adjust( page, -1, 1 ); 33.2352 + 33.2353 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 33.2354 + 33.2355 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 33.2356 + { 33.2357 + if ( pt[i] & _PAGE_PRESENT ) 33.2358 + { 33.2359 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 33.2360 + struct pfn_info *l1page = &frame_table[l1pfn]; 33.2361 + 33.2362 + if ( page_get_owner(l1page) != d ) 33.2363 + { 33.2364 + printk("L2: Skip bizarre page belonging to other " 33.2365 + "dom %p\n", page_get_owner(l1page)); 33.2366 + continue; 33.2367 + } 33.2368 + 33.2369 + if ( (l1page->u.inuse.type_info & PGT_type_mask) == 33.2370 + PGT_l2_page_table ) 33.2371 + printk("Audit %d: [%x] Found %s Linear PT " 33.2372 + "t=%x pfn=%lx\n", d->id, i, 33.2373 + (l1pfn==pfn) ? 
"Self" : "Other", 33.2374 + l1page->u.inuse.type_info, 33.2375 + l1pfn); 33.2376 + else if ( (l1page->u.inuse.type_info & PGT_type_mask) != 33.2377 + PGT_l1_page_table ) 33.2378 + printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", 33.2379 + d->id, i, 33.2380 + l1page->u.inuse.type_info, 33.2381 + l1pfn); 33.2382 + 33.2383 + adjust(l1page, -1, 1); 33.2384 + } 33.2385 + } 33.2386 + 33.2387 + unmap_domain_mem(pt); 33.2388 + 33.2389 + break; 33.2390 + 33.2391 + 33.2392 + case PGT_l1_page_table: 33.2393 + 33.2394 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 33.2395 + adjust( page, -1, 1 ); 33.2396 + 33.2397 + if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 33.2398 + printk("Audit %d: L1 not validated %x\n", 33.2399 + d->id, page->u.inuse.type_info); 33.2400 +#if 0 33.2401 + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 33.2402 + printk("Audit %d: L1 not pinned %x\n", 33.2403 + d->id, page->u.inuse.type_info); 33.2404 +#endif 33.2405 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 33.2406 + 33.2407 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.2408 + { 33.2409 + if ( pt[i] & _PAGE_PRESENT ) 33.2410 + { 33.2411 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 33.2412 + struct pfn_info *l1page = &frame_table[l1pfn]; 33.2413 + 33.2414 + if ( l1pfn < 0x100 ) 33.2415 + { 33.2416 + lowmem_mappings++; 33.2417 + continue; 33.2418 + } 33.2419 + 33.2420 + if ( l1pfn > max_page ) 33.2421 + { 33.2422 + io_mappings++; 33.2423 + continue; 33.2424 + } 33.2425 + 33.2426 + if ( pt[i] & _PAGE_RW ) 33.2427 + { 33.2428 + 33.2429 + if ( (l1page->u.inuse.type_info & PGT_type_mask) == 33.2430 + PGT_l1_page_table || 33.2431 + (l1page->u.inuse.type_info & PGT_type_mask) == 33.2432 + PGT_l2_page_table ) 33.2433 + printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", 33.2434 + d->id, i, 33.2435 + l1page->u.inuse.type_info, 33.2436 + l1pfn); 33.2437 + 33.2438 + } 33.2439 + 33.2440 + if ( page_get_owner(l1page) != d ) 33.2441 + { 33.2442 + printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " 33.2443 + "pfn=%lx c=%08x t=%08x m2p=%lx\n", 33.2444 + d->id, pfn, i, 33.2445 + page_get_owner(l1page), 33.2446 + l1pfn, 33.2447 + l1page->count_info, 33.2448 + l1page->u.inuse.type_info, 33.2449 + machine_to_phys_mapping[l1pfn]); 33.2450 + continue; 33.2451 + } 33.2452 + 33.2453 + adjust(l1page, -1, 0); 33.2454 + } 33.2455 + } 33.2456 + 33.2457 + unmap_domain_mem(pt); 33.2458 + 33.2459 + break; 33.2460 + } 33.2461 + 33.2462 + list_ent = frame_table[pfn].list.next; 33.2463 + } 33.2464 + 33.2465 + if ( (io_mappings > 0) || (lowmem_mappings > 0) ) 33.2466 + printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", 33.2467 + d->id, lowmem_mappings, io_mappings); 33.2468 + 33.2469 + /* PHASE 2 */ 33.2470 + 33.2471 + ctot = ttot = 0; 33.2472 + list_ent = d->page_list.next; 33.2473 + for ( i = 0; (list_ent != &d->page_list); i++ ) 33.2474 + { 33.2475 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 33.2476 + page = &frame_table[pfn]; 33.2477 + 33.2478 + switch ( page->u.inuse.type_info & PGT_type_mask) 33.2479 + { 33.2480 + case PGT_l1_page_table: 33.2481 + case PGT_l2_page_table: 33.2482 + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) 33.2483 + { 33.2484 + printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n", 33.2485 + d->id, page->u.inuse.type_info, 33.2486 + page->tlbflush_timestamp, 33.2487 + page->count_info, pfn ); 33.2488 + scan_for_pfn_remote(pfn); 33.2489 + } 33.2490 + default: 33.2491 + if ( (page->count_info & PGC_count_mask) != 1 ) 33.2492 + { 
33.2493 + printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", 33.2494 + d->id, 33.2495 + page->count_info, 33.2496 + page->u.inuse.type_info, 33.2497 + page->tlbflush_timestamp, pfn ); 33.2498 + scan_for_pfn_remote(pfn); 33.2499 + } 33.2500 + break; 33.2501 + } 33.2502 + 33.2503 + list_ent = frame_table[pfn].list.next; 33.2504 + } 33.2505 + 33.2506 + /* PHASE 3 */ 33.2507 + list_ent = d->page_list.next; 33.2508 + for ( i = 0; (list_ent != &d->page_list); i++ ) 33.2509 + { 33.2510 + unsigned long *pt; 33.2511 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 33.2512 + page = &frame_table[pfn]; 33.2513 + 33.2514 + switch ( page->u.inuse.type_info & PGT_type_mask ) 33.2515 + { 33.2516 + case PGT_l2_page_table: 33.2517 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 33.2518 + adjust( page, 1, 1 ); 33.2519 + 33.2520 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 33.2521 + 33.2522 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 33.2523 + { 33.2524 + if ( pt[i] & _PAGE_PRESENT ) 33.2525 + { 33.2526 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 33.2527 + struct pfn_info *l1page; 33.2528 + 33.2529 + if (l1pfn>max_page) 33.2530 + continue; 33.2531 + 33.2532 + l1page = &frame_table[l1pfn]; 33.2533 + 33.2534 + if ( page_get_owner(l1page) == d ) 33.2535 + adjust(l1page, 1, 1); 33.2536 + } 33.2537 + } 33.2538 + 33.2539 + unmap_domain_mem(pt); 33.2540 + break; 33.2541 + 33.2542 + case PGT_l1_page_table: 33.2543 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 33.2544 + adjust( page, 1, 1 ); 33.2545 + 33.2546 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 33.2547 + 33.2548 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 33.2549 + { 33.2550 + if ( pt[i] & _PAGE_PRESENT ) 33.2551 + { 33.2552 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 33.2553 + struct pfn_info *l1page; 33.2554 + 33.2555 + if (l1pfn>max_page) 33.2556 + continue; 33.2557 + 33.2558 + l1page = &frame_table[l1pfn]; 33.2559 + 33.2560 + if ( (page_get_owner(l1page) != d) || 33.2561 + (l1pfn < 0x100) || (l1pfn > max_page) ) 33.2562 + continue; 33.2563 + 33.2564 + adjust(l1page, 1, 0); 33.2565 + } 33.2566 + } 33.2567 + 33.2568 + unmap_domain_mem(pt); 33.2569 + break; 33.2570 + } 33.2571 + 33.2572 + 33.2573 + page->tlbflush_timestamp = 0; 33.2574 + 33.2575 + list_ent = frame_table[pfn].list.next; 33.2576 + } 33.2577 + 33.2578 + spin_unlock(&d->page_alloc_lock); 33.2579 + 33.2580 + adjust(&frame_table[pagetable_val( 33.2581 + d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1); 33.2582 + 33.2583 + printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot ); 33.2584 + 33.2585 + if ( d != current->domain ) 33.2586 + domain_unpause(d); 33.2587 +} 33.2588 + 33.2589 +void audit_domains(void) 33.2590 +{ 33.2591 + struct domain *d; 33.2592 + for_each_domain ( d ) 33.2593 + audit_domain(d); 33.2594 +} 33.2595 + 33.2596 +void audit_domains_key(unsigned char key) 33.2597 +{ 33.2598 + audit_domains(); 33.2599 +} 33.2600 + 33.2601 +#endif
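[Editor's illustration — not part of the changeset.] The do_mmu_update() hunk above steals bits of its 'count' argument to carry continuation state across hypercall preemption: the most significant bit marks a resumed call and the next 15 bits remember the current FOREIGNDOM. A minimal standalone sketch of that encoding, using the macro values from the patch; the encode/decode helper names and the main() driver are invented purely for illustration.

#include <stdio.h>

/* Values as defined in the do_mmu_update() hunk above. */
#define MMU_UPDATE_PREEMPTED          (~(~0U >> 1))            /* bit 31 */
#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int) * 8) - 16) /* 16 */
#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU << MMU_UPDATE_PREEMPT_FDOM_SHIFT)

/* Pack the remaining request count and the foreign domain id into 'count',
 * mirroring the hypercall3_create_continuation() call in the patch. */
static unsigned int encode_continuation(unsigned int remaining, unsigned int fdom_id)
{
    return remaining |
           (fdom_id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
           MMU_UPDATE_PREEMPTED;
}

/* Recover both fields, mirroring the checks at the top of do_mmu_update(). */
static void decode_continuation(unsigned int count,
                                unsigned int *remaining, unsigned int *fdom_id)
{
    *fdom_id   = (count & MMU_UPDATE_PREEMPT_FDOM_MASK) >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
    *remaining = count & ~(MMU_UPDATE_PREEMPTED | MMU_UPDATE_PREEMPT_FDOM_MASK);
}

int main(void)
{
    unsigned int c = encode_continuation(42, 7), rem, dom;
    decode_continuation(c, &rem, &dom);
    printf("count=%#x remaining=%u fdom=%u\n", c, rem, dom);  /* 42 and 7 */
    return 0;
}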
34.1 --- a/xen/arch/x86/setup.c Mon Feb 07 08:19:24 2005 +0000 34.2 +++ b/xen/arch/x86/setup.c Tue Feb 08 16:44:16 2005 +0000 34.3 @@ -298,19 +298,21 @@ void __init identify_cpu(struct cpuinfo_ 34.4 unsigned long cpu_initialized; 34.5 void __init cpu_init(void) 34.6 { 34.7 - extern void percpu_traps_init(void); 34.8 int nr = smp_processor_id(); 34.9 struct tss_struct *t = &init_tss[nr]; 34.10 + unsigned char idt_load[10]; 34.11 34.12 if ( test_and_set_bit(nr, &cpu_initialized) ) 34.13 panic("CPU#%d already initialized!!!\n", nr); 34.14 printk("Initializing CPU#%d\n", nr); 34.15 34.16 - /* Set up GDT and IDT. */ 34.17 SET_GDT_ENTRIES(current, DEFAULT_GDT_ENTRIES); 34.18 SET_GDT_ADDRESS(current, DEFAULT_GDT_ADDRESS); 34.19 __asm__ __volatile__ ( "lgdt %0" : "=m" (*current->arch.gdt) ); 34.20 - __asm__ __volatile__ ( "lidt %0" : "=m" (idt_descr) ); 34.21 + 34.22 + *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; 34.23 + *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[nr]; 34.24 + __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) ); 34.25 34.26 /* No nested task. */ 34.27 __asm__ __volatile__ ( "pushf ; andw $0xbfff,(%"__OP"sp) ; popf" ); 34.28 @@ -336,8 +338,6 @@ void __init cpu_init(void) 34.29 CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); 34.30 #undef CD 34.31 34.32 - percpu_traps_init(); 34.33 - 34.34 /* Install correct page table. */ 34.35 write_ptbase(current); 34.36
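[Editor's illustration — not part of the changeset.] The setup.c hunk above switches cpu_init() from the shared idt_descr to a per-CPU IDT by assembling the 'lidt' pseudo-descriptor by hand in a byte buffer: a 16-bit limit at offset 0 and the base address of idt_tables[nr] at offset 2. A minimal sketch of the same layout, assuming 32-bit x86; the struct and function names are invented for clarity (the patch itself uses a raw unsigned char idt_load[10]).

#include <stdint.h>

/* Invented name; shows the byte layout the two stores in the patch build. */
struct idt_pseudo_descriptor {
    uint16_t limit;   /* (IDT_ENTRIES * sizeof(idt_entry_t)) - 1 */
    uint32_t base;    /* address of this CPU's idt_tables[nr] (32-bit case) */
} __attribute__((packed));

static inline void load_idt_sketch(uint32_t base, uint16_t limit)
{
    struct idt_pseudo_descriptor dp = { .limit = limit, .base = base };
    /* Same effect as the two stores into idt_load[] followed by "lidt". */
    __asm__ __volatile__ ( "lidt %0" : : "m" (dp) );
}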
35.1 --- a/xen/arch/x86/shadow.c Mon Feb 07 08:19:24 2005 +0000 35.2 +++ b/xen/arch/x86/shadow.c Tue Feb 08 16:44:16 2005 +0000 35.3 @@ -73,11 +73,11 @@ static void free_shadow_state(struct dom 35.4 35.5 /* Free the head page. */ 35.6 free_shadow_page( 35.7 - d, &frame_table[x->spfn_and_flags & PSH_pfn_mask]); 35.8 + d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]); 35.9 35.10 /* Reinitialise the head node. */ 35.11 x->pfn = 0; 35.12 - x->spfn_and_flags = 0; 35.13 + x->smfn_and_flags = 0; 35.14 n = x->next; 35.15 x->next = NULL; 35.16 35.17 @@ -88,11 +88,11 @@ static void free_shadow_state(struct dom 35.18 { 35.19 /* Free the shadow page. */ 35.20 free_shadow_page( 35.21 - d, &frame_table[x->spfn_and_flags & PSH_pfn_mask]); 35.22 + d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]); 35.23 35.24 /* Re-initialise the chain node. */ 35.25 x->pfn = 0; 35.26 - x->spfn_and_flags = 0; 35.27 + x->smfn_and_flags = 0; 35.28 35.29 /* Add to the free list. */ 35.30 n = x->next; 35.31 @@ -113,14 +113,14 @@ static inline int clear_shadow_page( 35.32 { 35.33 unsigned long *p; 35.34 int restart = 0; 35.35 - struct pfn_info *spage = &frame_table[x->spfn_and_flags & PSH_pfn_mask]; 35.36 + struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask]; 35.37 35.38 switch ( spage->u.inuse.type_info & PGT_type_mask ) 35.39 { 35.40 /* We clear L2 pages by zeroing the guest entries. */ 35.41 case PGT_l2_page_table: 35.42 p = map_domain_mem((spage - frame_table) << PAGE_SHIFT); 35.43 - if (d->arch.shadow_mode == SHM_full_32) 35.44 + if ( shadow_mode(d) == SHM_full_32 ) 35.45 memset(p, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); 35.46 else 35.47 memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); 35.48 @@ -419,7 +419,7 @@ static inline struct pfn_info *alloc_sha 35.49 35.50 void unshadow_table(unsigned long gpfn, unsigned int type) 35.51 { 35.52 - unsigned long spfn; 35.53 + unsigned long smfn; 35.54 struct domain *d = page_get_owner(&frame_table[gpfn]); 35.55 35.56 SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, gpfn); 35.57 @@ -431,15 +431,15 @@ void unshadow_table(unsigned long gpfn, 35.58 * guests there won't be a race here as this CPU was the one that 35.59 * cmpxchg'ed the page to invalid. 35.60 */ 35.61 - spfn = __shadow_status(d, gpfn) & PSH_pfn_mask; 35.62 + smfn = __shadow_status(d, gpfn) & PSH_pfn_mask; 35.63 delete_shadow_status(d, gpfn); 35.64 - free_shadow_page(d, &frame_table[spfn]); 35.65 + free_shadow_page(d, &frame_table[smfn]); 35.66 } 35.67 35.68 #ifdef CONFIG_VMX 35.69 void vmx_shadow_clear_state(struct domain *d) 35.70 { 35.71 - SH_VVLOG("vmx_clear_shadow_state: \n"); 35.72 + SH_VVLOG("vmx_clear_shadow_state:"); 35.73 clear_shadow_state(d); 35.74 } 35.75 #endif 35.76 @@ -453,7 +453,7 @@ unsigned long shadow_l2_table( 35.77 l2_pgentry_t *spl2e = 0; 35.78 unsigned long guest_gpfn; 35.79 35.80 - __get_machine_to_phys(d, guest_gpfn, gpfn); 35.81 + guest_gpfn = __mfn_to_gpfn(d, gpfn); 35.82 35.83 SH_VVLOG("shadow_l2_table( %08lx )", gpfn); 35.84 35.85 @@ -471,9 +471,13 @@ unsigned long shadow_l2_table( 35.86 35.87 #ifdef __i386__ 35.88 /* Install hypervisor and 2x linear p.t. mapings. 
*/ 35.89 - if ( d->arch.shadow_mode == SHM_full_32 ) 35.90 + if ( shadow_mode(d) == SHM_full_32 ) 35.91 { 35.92 +#ifdef CONFIG_VMX 35.93 vmx_update_shadow_state(d->exec_domain[0], gpfn, spfn); 35.94 +#else 35.95 + panic("Shadow Full 32 not yet implemented without VMX\n"); 35.96 +#endif 35.97 } 35.98 else 35.99 { 35.100 @@ -499,7 +503,7 @@ unsigned long shadow_l2_table( 35.101 } 35.102 #endif 35.103 35.104 - if ( d->arch.shadow_mode != SHM_full_32 ) 35.105 + if ( shadow_mode(d) != SHM_full_32 ) 35.106 unmap_domain_mem(spl2e); 35.107 35.108 SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn); 35.109 @@ -510,13 +514,13 @@ static void shadow_map_l1_into_current_l 35.110 { 35.111 struct exec_domain *ed = current; 35.112 struct domain *d = ed->domain; 35.113 - unsigned long *gpl1e, *spl1e, gpl2e, spl2e, gl1pfn, sl1pfn=0, sl1ss; 35.114 + unsigned long *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1pfn=0, sl1ss; 35.115 struct pfn_info *sl1pfn_info; 35.116 int i; 35.117 35.118 - __guest_get_pl2e(ed, va, &gpl2e); 35.119 + __guest_get_l2e(ed, va, &gl2e); 35.120 35.121 - gl1pfn = gpl2e >> PAGE_SHIFT; 35.122 + gl1pfn = gl2e >> PAGE_SHIFT; 35.123 35.124 sl1ss = __shadow_status(d, gl1pfn); 35.125 if ( !(sl1ss & PSH_shadowed) ) 35.126 @@ -534,10 +538,10 @@ static void shadow_map_l1_into_current_l 35.127 35.128 set_shadow_status(d, gl1pfn, PSH_shadowed | sl1pfn); 35.129 35.130 - l2pde_general(d, &gpl2e, &spl2e, sl1pfn); 35.131 + l2pde_general(d, &gl2e, &sl2e, sl1pfn); 35.132 35.133 - __guest_set_pl2e(ed, va, gpl2e); 35.134 - __shadow_set_pl2e(ed, va, spl2e); 35.135 + __guest_set_l2e(ed, va, gl2e); 35.136 + __shadow_set_l2e(ed, va, sl2e); 35.137 35.138 gpl1e = (unsigned long *) &(linear_pg_table[ 35.139 (va>>L1_PAGETABLE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]); 35.140 @@ -554,9 +558,9 @@ static void shadow_map_l1_into_current_l 35.141 SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn); 35.142 35.143 sl1pfn = sl1ss & PSH_pfn_mask; 35.144 - l2pde_general(d, &gpl2e, &spl2e, sl1pfn); 35.145 - __guest_set_pl2e(ed, va, gpl2e); 35.146 - __shadow_set_pl2e(ed, va, spl2e); 35.147 + l2pde_general(d, &gl2e, &sl2e, sl1pfn); 35.148 + __guest_set_l2e(ed, va, gl2e); 35.149 + __shadow_set_l2e(ed, va, sl2e); 35.150 } 35.151 } 35.152 35.153 @@ -576,7 +580,7 @@ void vmx_shadow_invlpg(struct domain *d, 35.154 return; 35.155 } 35.156 35.157 - host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; 35.158 + host_pfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT); 35.159 spte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 35.160 35.161 if (__put_user(spte, (unsigned long *) 35.162 @@ -588,7 +592,7 @@ void vmx_shadow_invlpg(struct domain *d, 35.163 35.164 int shadow_fault(unsigned long va, long error_code) 35.165 { 35.166 - unsigned long gpte, spte; 35.167 + unsigned long gpte, spte = 0; 35.168 struct exec_domain *ed = current; 35.169 struct domain *d = ed->domain; 35.170 35.171 @@ -628,14 +632,14 @@ int shadow_fault(unsigned long va, long 35.172 if ( unlikely(__get_user(gpte, (unsigned long *) 35.173 &linear_pg_table[va >> PAGE_SHIFT])) ) 35.174 { 35.175 - SH_VVLOG("shadow_fault - EXIT: read gpte faulted" ); 35.176 + SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" ); 35.177 shadow_unlock(d); 35.178 return 0; 35.179 } 35.180 35.181 if ( unlikely(!(gpte & _PAGE_PRESENT)) ) 35.182 { 35.183 - SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); 35.184 + SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte ); 35.185 shadow_unlock(d); 35.186 return 0; 35.187 } 35.188 @@ -691,20 +695,20 @@ int 
shadow_fault(unsigned long va, long 35.189 35.190 void shadow_l1_normal_pt_update( 35.191 unsigned long pa, unsigned long gpte, 35.192 - unsigned long *prev_spfn_ptr, 35.193 + unsigned long *prev_smfn_ptr, 35.194 l1_pgentry_t **prev_spl1e_ptr) 35.195 { 35.196 - unsigned long spfn, spte, prev_spfn = *prev_spfn_ptr; 35.197 + unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr; 35.198 l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr; 35.199 35.200 /* N.B. To get here, we know the l1 page *must* be shadowed. */ 35.201 SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, " 35.202 - "prev_spfn=%08lx, prev_spl1e=%p\n", 35.203 - pa, gpte, prev_spfn, prev_spl1e); 35.204 + "prev_smfn=%08lx, prev_spl1e=%p", 35.205 + pa, gpte, prev_smfn, prev_spl1e); 35.206 35.207 - spfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; 35.208 + smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; 35.209 35.210 - if ( spfn == prev_spfn ) 35.211 + if ( smfn == prev_smfn ) 35.212 { 35.213 spl1e = prev_spl1e; 35.214 } 35.215 @@ -712,8 +716,8 @@ void shadow_l1_normal_pt_update( 35.216 { 35.217 if ( prev_spl1e != NULL ) 35.218 unmap_domain_mem( prev_spl1e ); 35.219 - spl1e = (l1_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); 35.220 - *prev_spfn_ptr = spfn; 35.221 + spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT); 35.222 + *prev_smfn_ptr = smfn; 35.223 *prev_spl1e_ptr = spl1e; 35.224 } 35.225 35.226 @@ -721,24 +725,24 @@ void shadow_l1_normal_pt_update( 35.227 spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte); 35.228 } 35.229 35.230 -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte) 35.231 +void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde) 35.232 { 35.233 - unsigned long spfn, spte; 35.234 + unsigned long sl2mfn, spde; 35.235 l2_pgentry_t *spl2e; 35.236 - unsigned long s_sh; 35.237 + unsigned long sl1mfn; 35.238 35.239 /* N.B. To get here, we know the l2 page *must* be shadowed. */ 35.240 - SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte); 35.241 + SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpde=%08lx",pa,gpde); 35.242 35.243 - spfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; 35.244 + sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; 35.245 35.246 - s_sh = (gpte & _PAGE_PRESENT) ? 35.247 - __shadow_status(current->domain, gpte >> PAGE_SHIFT) : 0; 35.248 + sl1mfn = (gpde & _PAGE_PRESENT) ? 35.249 + __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0; 35.250 35.251 /* XXXX Should mark guest pte as DIRTY and ACCESSED too! */ 35.252 - l2pde_general(current->domain, &gpte, &spte, s_sh); 35.253 - spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); 35.254 - spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spte); 35.255 + l2pde_general(current->domain, &gpde, &spde, sl1mfn); 35.256 + spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT); 35.257 + spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde); 35.258 unmap_domain_mem(spl2e); 35.259 } 35.260 35.261 @@ -751,23 +755,36 @@ void shadow_l2_normal_pt_update(unsigned 35.262 35.263 #if SHADOW_DEBUG 35.264 35.265 +// BUG: these are not SMP safe... 35.266 static int sh_l2_present; 35.267 static int sh_l1_present; 35.268 +static int errors; 35.269 char * sh_check_name; 35.270 35.271 -#define FAIL(_f, _a...) 
\ 35.272 - do { \ 35.273 - printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n", \ 35.274 - sh_check_name, level, i, ## _a , gpte, spte); \ 35.275 - BUG(); \ 35.276 +#define virt_to_phys2(adr) ({ \ 35.277 + unsigned long _a = (unsigned long)(adr); \ 35.278 + unsigned long _pte = l1_pgentry_val( \ 35.279 + shadow_linear_pg_table[_a >> PAGE_SHIFT]); \ 35.280 + unsigned long _pa = _pte & PAGE_MASK; \ 35.281 + _pa | (_a & ~PAGE_MASK); \ 35.282 +}) 35.283 + 35.284 +#define FAIL(_f, _a...) \ 35.285 + do { \ 35.286 + printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx &g=%08lx &s=%08lx" \ 35.287 + " pa(&g)=%08lx pa(&s)=%08lx\n", \ 35.288 + sh_check_name, level, i, ## _a , gpte, spte, pgpte, pspte, \ 35.289 + virt_to_phys2(pgpte), virt_to_phys2(pspte)); \ 35.290 + errors++; \ 35.291 } while ( 0 ) 35.292 35.293 static int check_pte( 35.294 - struct domain *d, unsigned long gpte, unsigned long spte, 35.295 + struct domain *d, unsigned long *pgpte, unsigned long *pspte, 35.296 int level, int i) 35.297 { 35.298 - unsigned long mask, gpfn, spfn; 35.299 - unsigned long guest_gpfn; 35.300 + unsigned gpte = *pgpte; 35.301 + unsigned spte = *pspte; 35.302 + unsigned long mask, gpfn, smfn; 35.303 35.304 if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) ) 35.305 return 1; /* always safe */ 35.306 @@ -781,7 +798,7 @@ static int check_pte( 35.307 if ( !(gpte & _PAGE_PRESENT) ) 35.308 FAIL("Guest not present yet shadow is"); 35.309 35.310 - mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000); 35.311 + mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK); 35.312 35.313 if ( (spte & mask) != (gpte & mask) ) 35.314 FAIL("Corrupt?"); 35.315 @@ -798,10 +815,10 @@ static int check_pte( 35.316 if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) ) 35.317 FAIL("RW2 coherence"); 35.318 35.319 - spfn = spte >> PAGE_SHIFT; 35.320 + smfn = spte >> PAGE_SHIFT; 35.321 gpfn = gpte >> PAGE_SHIFT; 35.322 35.323 - if ( gpfn == spfn ) 35.324 + if ( gpfn == smfn ) 35.325 { 35.326 if ( level > 1 ) 35.327 FAIL("Linear map ???"); /* XXX this will fail on BSD */ 35.328 @@ -811,20 +828,9 @@ static int check_pte( 35.329 if ( level < 2 ) 35.330 FAIL("Shadow in L1 entry?"); 35.331 35.332 - if (d->arch.shadow_mode == SHM_full_32) { 35.333 - 35.334 - guest_gpfn = phys_to_machine_mapping[gpfn]; 35.335 - 35.336 - if ( __shadow_status(d, guest_gpfn) != (PSH_shadowed | spfn) ) 35.337 - FAIL("spfn problem g.sf=%08lx", 35.338 - __shadow_status(d, guest_gpfn) ); 35.339 - 35.340 - } else { 35.341 - if ( __shadow_status(d, gpfn) != (PSH_shadowed | spfn) ) 35.342 - FAIL("spfn problem g.sf=%08lx", 35.343 - __shadow_status(d, gpfn) ); 35.344 - } 35.345 - 35.346 + if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) ) 35.347 + FAIL("smfn problem g.sf=%08lx", 35.348 + __shadow_status(d, gpfn) ); 35.349 } 35.350 35.351 return 1; 35.352 @@ -832,17 +838,17 @@ static int check_pte( 35.353 35.354 35.355 static int check_l1_table( 35.356 - struct domain *d, unsigned long va, 35.357 - unsigned long g2, unsigned long s2) 35.358 + struct domain *d, 35.359 + unsigned long g2mfn, unsigned long s2mfn) 35.360 { 35.361 int i; 35.362 unsigned long *gpl1e, *spl1e; 35.363 35.364 - gpl1e = map_domain_mem(g2 << PAGE_SHIFT); 35.365 - spl1e = map_domain_mem(s2 << PAGE_SHIFT); 35.366 + gpl1e = map_domain_mem(g2mfn << PAGE_SHIFT); 35.367 + spl1e = map_domain_mem(s2mfn << PAGE_SHIFT); 35.368 35.369 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 35.370 - check_pte(d, gpl1e[i], spl1e[i], 1, i); 35.371 + check_pte(d, &gpl1e[i], 
&spl1e[i], 1, i); 35.372 35.373 unmap_domain_mem(spl1e); 35.374 unmap_domain_mem(gpl1e); 35.375 @@ -853,49 +859,46 @@ static int check_l1_table( 35.376 #define FAILPT(_f, _a...) \ 35.377 do { \ 35.378 printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); \ 35.379 - BUG(); \ 35.380 + errors++; \ 35.381 } while ( 0 ) 35.382 35.383 -int check_pagetable(struct domain *d, pagetable_t pt, char *s) 35.384 +void check_pagetable(struct domain *d, pagetable_t pt, char *s) 35.385 { 35.386 unsigned long gptbase = pagetable_val(pt); 35.387 - unsigned long gpfn, spfn; 35.388 + unsigned long ptbase_pfn, smfn, ss; 35.389 unsigned long i; 35.390 l2_pgentry_t *gpl2e, *spl2e; 35.391 - unsigned long host_gpfn = 0; 35.392 + unsigned long ptbase_mfn = 0; 35.393 + int cpu = current->processor; 35.394 35.395 + errors = 0; 35.396 sh_check_name = s; 35.397 35.398 SH_VVLOG("%s-PT Audit", s); 35.399 35.400 sh_l2_present = sh_l1_present = 0; 35.401 35.402 - gpfn = gptbase >> PAGE_SHIFT; 35.403 + ptbase_pfn = gptbase >> PAGE_SHIFT; 35.404 + ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn); 35.405 35.406 - __get_phys_to_machine(d, host_gpfn, gpfn); 35.407 + ss = __shadow_status(d, ptbase_pfn); 35.408 35.409 - if ( ! (__shadow_status(d, gpfn) & PSH_shadowed) ) 35.410 + if ( ! (ss & PSH_shadowed) ) 35.411 { 35.412 printk("%s-PT %08lx not shadowed\n", s, gptbase); 35.413 35.414 - if( __shadow_status(d, gpfn) != 0 ) BUG(); 35.415 - return 0; 35.416 + if ( ss != 0 ) 35.417 + BUG(); 35.418 + return; 35.419 } 35.420 35.421 - spfn = __shadow_status(d, gpfn) & PSH_pfn_mask; 35.422 - 35.423 - if ( ! __shadow_status(d, gpfn) == (PSH_shadowed | spfn) ) 35.424 - FAILPT("ptbase shadow inconsistent1"); 35.425 + smfn = ss & PSH_pfn_mask; 35.426 35.427 - if (d->arch.shadow_mode == SHM_full_32) 35.428 - { 35.429 - host_gpfn = phys_to_machine_mapping[gpfn]; 35.430 - gpl2e = (l2_pgentry_t *) map_domain_mem( host_gpfn << PAGE_SHIFT ); 35.431 + if ( ss != (PSH_shadowed | smfn) ) 35.432 + FAILPT("ptbase shadow inconsistent1"); 35.433 35.434 - } else 35.435 - gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT ); 35.436 - 35.437 - spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); 35.438 + gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT ); 35.439 + spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT ); 35.440 35.441 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 35.442 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 35.443 @@ -916,40 +919,60 @@ int check_pagetable(struct domain *d, pa 35.444 35.445 if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 35.446 L2_PAGETABLE_SHIFT]) != 35.447 - ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) ) 35.448 + ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) ) 35.449 FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx", 35.450 l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 35.451 L2_PAGETABLE_SHIFT]), 35.452 - (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 35.453 + (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 35.454 35.455 - if (d->arch.shadow_mode != SHM_full_32) { 35.456 + if ( shadow_mode(d) != SHM_full_32 ) { 35.457 + // BUG: this shouldn't be using exec_domain[0] here... 35.458 if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != 35.459 - ((__pa(page_get_owner(&frame_table[gpfn])->arch.mm_perdomain_pt) | 35.460 + ((__pa(page_get_owner(&frame_table[ptbase_pfn])->arch.mm_perdomain_pt) | 35.461 __PAGE_HYPERVISOR))) ) 35.462 FAILPT("hypervisor per-domain map inconsistent"); 35.463 } 35.464 35.465 /* Check the whole L2. 
*/ 35.466 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 35.467 - check_pte(d, l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]), 2, i); 35.468 + check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i); 35.469 35.470 /* Go back and recurse. */ 35.471 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 35.472 { 35.473 + unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT; 35.474 + unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn); 35.475 + unsigned long sl1mfn = l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT; 35.476 + 35.477 if ( l2_pgentry_val(spl2e[i]) != 0 ) 35.478 - check_l1_table( 35.479 - d, i << L2_PAGETABLE_SHIFT, 35.480 - l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT, 35.481 - l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT); 35.482 + { 35.483 + // First check to see if this guest page is currently the active 35.484 + // PTWR page. If so, then we compare the (old) cached copy of the 35.485 + // guest page to the shadow, and not the currently writable (and 35.486 + // thus potentially out-of-sync) guest page. 35.487 + // 35.488 + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && 35.489 + (i == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx) && 35.490 + likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) 35.491 + { 35.492 + gl1mfn = (__pa(ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].page) >> 35.493 + PAGE_SHIFT); 35.494 + } 35.495 + 35.496 + check_l1_table(d, gl1mfn, sl1mfn); 35.497 + } 35.498 } 35.499 35.500 unmap_domain_mem(spl2e); 35.501 unmap_domain_mem(gpl2e); 35.502 35.503 - SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n", 35.504 + SH_VVLOG("PT verified : l2_present = %d, l1_present = %d", 35.505 sh_l2_present, sh_l1_present); 35.506 35.507 - return 1; 35.508 + if ( errors ) 35.509 + BUG(); 35.510 + 35.511 + return; 35.512 } 35.513 35.514 -#endif 35.515 +#endif // SHADOW_DEBUG
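Note on the shadow-audit renaming above: the code now keeps three kinds of frame number apart — a gpfn (a frame number as the guest sees it), the machine frame actually backing it (obtained via __gpfn_to_mfn()), and the smfn holding the shadow copy of a page table, recovered from the __shadow_status() word. A rough sketch of that last step, using the names from the patch (the helper itself is illustrative, not in the tree):

    static inline unsigned long audit_lookup_smfn(struct domain *d,
                                                  unsigned long gpfn)
    {
        unsigned long ss = __shadow_status(d, gpfn);
        if ( !(ss & PSH_shadowed) )
            return 0;                 /* no shadow exists for this frame */
        return ss & PSH_pfn_mask;     /* machine frame holding the shadow */
    }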
36.1 --- a/xen/arch/x86/smpboot.c Mon Feb 07 08:19:24 2005 +0000 36.2 +++ b/xen/arch/x86/smpboot.c Tue Feb 08 16:44:16 2005 +0000 36.3 @@ -388,33 +388,27 @@ static int cpucount; 36.4 void __init start_secondary(void) 36.5 { 36.6 unsigned int cpu = cpucount; 36.7 - /* 6 bytes suitable for passing to LIDT instruction. */ 36.8 - unsigned char idt_load[6]; 36.9 36.10 + extern void percpu_traps_init(void); 36.11 extern void cpu_init(void); 36.12 36.13 set_current(idle_task[cpu]); 36.14 36.15 /* 36.16 - * Dont put anything before smp_callin(), SMP 36.17 - * booting is too fragile that we want to limit the 36.18 - * things done here to the most necessary things. 36.19 - */ 36.20 - cpu_init(); 36.21 - smp_callin(); 36.22 - 36.23 - while (!atomic_read(&smp_commenced)) 36.24 - rep_nop(); 36.25 - 36.26 - /* 36.27 * At this point, boot CPU has fully initialised the IDT. It is 36.28 * now safe to make ourselves a private copy. 36.29 */ 36.30 idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); 36.31 memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t)); 36.32 - *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; 36.33 - *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu]; 36.34 - __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) ); 36.35 + 36.36 + percpu_traps_init(); 36.37 + 36.38 + cpu_init(); 36.39 + 36.40 + smp_callin(); 36.41 + 36.42 + while (!atomic_read(&smp_commenced)) 36.43 + rep_nop(); 36.44 36.45 /* 36.46 * low-memory mappings have been cleared, flush them from the local TLBs
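For reference, the open-coded sequence removed from start_secondary() built a 6-byte LIDT pseudo-descriptor (16-bit limit, 32-bit base) by hand; percpu_traps_init() and cpu_init() now cover loading the per-CPU IDT copy. A sketch of what those removed bytes amounted to (the struct and helper are illustrative, not from the tree):

    struct __attribute__((packed)) idt_ptr {
        unsigned short limit;                 /* IDT size in bytes, minus 1 */
        unsigned long  base;                  /* linear address of the IDT  */
    };

    static void load_private_idt(unsigned int cpu)
    {
        struct idt_ptr p = {
            .limit = IDT_ENTRIES * sizeof(idt_entry_t) - 1,
            .base  = (unsigned long)idt_tables[cpu],
        };
        __asm__ __volatile__ ( "lidt %0" : : "m" (p) );
    }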
37.1 --- a/xen/arch/x86/traps.c Mon Feb 07 08:19:24 2005 +0000 37.2 +++ b/xen/arch/x86/traps.c Tue Feb 08 16:44:16 2005 +0000 37.3 @@ -149,6 +149,11 @@ static inline int do_trap(int trapnr, ch 37.4 if ( !GUEST_FAULT(regs) ) 37.5 goto xen_fault; 37.6 37.7 +#ifndef NDEBUG 37.8 + if ( (ed->arch.traps[trapnr].address == 0) && (ed->domain->id == 0) ) 37.9 + goto xen_fault; 37.10 +#endif 37.11 + 37.12 ti = current->arch.traps + trapnr; 37.13 tb->flags = TBF_EXCEPTION; 37.14 tb->cs = ti->cs; 37.15 @@ -267,6 +272,12 @@ asmlinkage int do_page_fault(struct xen_ 37.16 37.17 perfc_incrc(page_faults); 37.18 37.19 +#if 0 37.20 + printk("do_page_fault(addr=0x%08lx, error_code=%d)\n", 37.21 + addr, regs->error_code); 37.22 + show_registers(regs); 37.23 +#endif 37.24 + 37.25 if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) 37.26 { 37.27 LOCK_BIGLOCK(d); 37.28 @@ -314,6 +325,11 @@ asmlinkage int do_page_fault(struct xen_ 37.29 if ( !GUEST_FAULT(regs) ) 37.30 goto xen_fault; 37.31 37.32 +#ifndef NDEBUG 37.33 + if ( (ed->arch.traps[TRAP_page_fault].address == 0) && (d->id == 0) ) 37.34 + goto xen_fault; 37.35 +#endif 37.36 + 37.37 propagate_page_fault(addr, regs->error_code); 37.38 return 0; 37.39 37.40 @@ -512,7 +528,7 @@ asmlinkage int do_general_protection(str 37.41 37.42 /* Emulate some simple privileged instructions when exec'ed in ring 1. */ 37.43 if ( (regs->error_code == 0) && 37.44 - RING_1(regs) && 37.45 + GUESTOS_FAULT(regs) && 37.46 emulate_privileged_op(regs) ) 37.47 return 0; 37.48 37.49 @@ -523,6 +539,12 @@ asmlinkage int do_general_protection(str 37.50 return 0; 37.51 #endif 37.52 37.53 +#ifndef NDEBUG 37.54 + if ( (ed->arch.traps[TRAP_gp_fault].address == 0) && 37.55 + (ed->domain->id == 0) ) 37.56 + goto gp_in_kernel; 37.57 +#endif 37.58 + 37.59 /* Pass on GPF as is. */ 37.60 ti = current->arch.traps + 13; 37.61 tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; 37.62 @@ -553,19 +575,55 @@ asmlinkage int do_general_protection(str 37.63 return 0; 37.64 } 37.65 37.66 +unsigned long nmi_softirq_reason; 37.67 +static void nmi_softirq(void) 37.68 +{ 37.69 + if ( dom0 == NULL ) 37.70 + return; 37.71 + 37.72 + if ( test_and_clear_bit(0, &nmi_softirq_reason) ) 37.73 + send_guest_virq(dom0->exec_domain[0], VIRQ_PARITY_ERR); 37.74 + 37.75 + if ( test_and_clear_bit(1, &nmi_softirq_reason) ) 37.76 + send_guest_virq(dom0->exec_domain[0], VIRQ_IO_ERR); 37.77 +} 37.78 + 37.79 asmlinkage void mem_parity_error(struct xen_regs *regs) 37.80 { 37.81 - console_force_unlock(); 37.82 - printk("\n\nNMI - MEMORY ERROR\n"); 37.83 - fatal_trap(TRAP_nmi, regs); 37.84 + /* Clear and disable the parity-error line. */ 37.85 + outb((inb(0x61)&15)|4,0x61); 37.86 + 37.87 + switch ( opt_nmi[0] ) 37.88 + { 37.89 + case 'd': /* 'dom0' */ 37.90 + set_bit(0, &nmi_softirq_reason); 37.91 + raise_softirq(NMI_SOFTIRQ); 37.92 + case 'i': /* 'ignore' */ 37.93 + break; 37.94 + default: /* 'fatal' */ 37.95 + console_force_unlock(); 37.96 + printk("\n\nNMI - MEMORY ERROR\n"); 37.97 + fatal_trap(TRAP_nmi, regs); 37.98 + } 37.99 } 37.100 37.101 asmlinkage void io_check_error(struct xen_regs *regs) 37.102 { 37.103 - console_force_unlock(); 37.104 + /* Clear and disable the I/O-error line. 
*/ 37.105 + outb((inb(0x61)&15)|8,0x61); 37.106 37.107 - printk("\n\nNMI - I/O ERROR\n"); 37.108 - fatal_trap(TRAP_nmi, regs); 37.109 + switch ( opt_nmi[0] ) 37.110 + { 37.111 + case 'd': /* 'dom0' */ 37.112 + set_bit(0, &nmi_softirq_reason); 37.113 + raise_softirq(NMI_SOFTIRQ); 37.114 + case 'i': /* 'ignore' */ 37.115 + break; 37.116 + default: /* 'fatal' */ 37.117 + console_force_unlock(); 37.118 + printk("\n\nNMI - I/O ERROR\n"); 37.119 + fatal_trap(TRAP_nmi, regs); 37.120 + } 37.121 } 37.122 37.123 static void unknown_nmi_error(unsigned char reason) 37.124 @@ -579,27 +637,17 @@ asmlinkage void do_nmi(struct xen_regs * 37.125 { 37.126 ++nmi_count(smp_processor_id()); 37.127 37.128 -#if CONFIG_X86_LOCAL_APIC 37.129 if ( nmi_watchdog ) 37.130 nmi_watchdog_tick(regs); 37.131 - else 37.132 -#endif 37.133 + 37.134 + if ( reason & 0x80 ) 37.135 + mem_parity_error(regs); 37.136 + else if ( reason & 0x40 ) 37.137 + io_check_error(regs); 37.138 + else if ( !nmi_watchdog ) 37.139 unknown_nmi_error((unsigned char)(reason&0xff)); 37.140 } 37.141 37.142 -unsigned long nmi_softirq_reason; 37.143 -static void nmi_softirq(void) 37.144 -{ 37.145 - if ( dom0 == NULL ) 37.146 - return; 37.147 - 37.148 - if ( test_and_clear_bit(0, &nmi_softirq_reason) ) 37.149 - send_guest_virq(dom0->exec_domain[0], VIRQ_PARITY_ERR); 37.150 - 37.151 - if ( test_and_clear_bit(1, &nmi_softirq_reason) ) 37.152 - send_guest_virq(dom0->exec_domain[0], VIRQ_IO_ERR); 37.153 -} 37.154 - 37.155 asmlinkage int math_state_restore(struct xen_regs *regs) 37.156 { 37.157 /* Prevent recursion. */ 37.158 @@ -706,8 +754,8 @@ void set_tss_desc(unsigned int n, void * 37.159 37.160 void __init trap_init(void) 37.161 { 37.162 - extern void doublefault_init(void); 37.163 - doublefault_init(); 37.164 + extern void percpu_traps_init(void); 37.165 + extern void cpu_init(void); 37.166 37.167 /* 37.168 * Note that interrupt gates are always used, rather than trap gates. We 37.169 @@ -745,13 +793,9 @@ void __init trap_init(void) 37.170 /* CPU0 uses the master IDT. */ 37.171 idt_tables[0] = idt_table; 37.172 37.173 - /* 37.174 - * Should be a barrier for any external CPU state. 
37.175 - */ 37.176 - { 37.177 - extern void cpu_init(void); 37.178 - cpu_init(); 37.179 - } 37.180 + percpu_traps_init(); 37.181 + 37.182 + cpu_init(); 37.183 37.184 open_softirq(NMI_SOFTIRQ, nmi_softirq); 37.185 } 37.186 @@ -769,8 +813,8 @@ long do_set_trap_table(trap_info_t *trap 37.187 if ( hypercall_preempt_check() ) 37.188 { 37.189 UNLOCK_BIGLOCK(current->domain); 37.190 - return hypercall_create_continuation( 37.191 - __HYPERVISOR_set_trap_table, 1, traps); 37.192 + return hypercall1_create_continuation( 37.193 + __HYPERVISOR_set_trap_table, traps); 37.194 } 37.195 37.196 if ( copy_from_user(&cur, traps, sizeof(cur)) ) return -EFAULT; 37.197 @@ -816,6 +860,13 @@ long do_fpu_taskswitch(void) 37.198 } 37.199 37.200 37.201 +#if defined(__i386__) 37.202 +#define DB_VALID_ADDR(_a) \ 37.203 + ((_a) <= (PAGE_OFFSET - 4)) 37.204 +#elif defined(__x86_64__) 37.205 +#define DB_VALID_ADDR(_a) \ 37.206 + ((_a) >= HYPERVISOR_VIRT_END) || ((_a) <= (HYPERVISOR_VIRT_START-8)) 37.207 +#endif 37.208 long set_debugreg(struct exec_domain *p, int reg, unsigned long value) 37.209 { 37.210 int i; 37.211 @@ -823,22 +874,22 @@ long set_debugreg(struct exec_domain *p, 37.212 switch ( reg ) 37.213 { 37.214 case 0: 37.215 - if ( value > (PAGE_OFFSET-4) ) return -EPERM; 37.216 + if ( !DB_VALID_ADDR(value) ) return -EPERM; 37.217 if ( p == current ) 37.218 __asm__ ( "mov %0, %%db0" : : "r" (value) ); 37.219 break; 37.220 case 1: 37.221 - if ( value > (PAGE_OFFSET-4) ) return -EPERM; 37.222 + if ( !DB_VALID_ADDR(value) ) return -EPERM; 37.223 if ( p == current ) 37.224 __asm__ ( "mov %0, %%db1" : : "r" (value) ); 37.225 break; 37.226 case 2: 37.227 - if ( value > (PAGE_OFFSET-4) ) return -EPERM; 37.228 + if ( !DB_VALID_ADDR(value) ) return -EPERM; 37.229 if ( p == current ) 37.230 __asm__ ( "mov %0, %%db2" : : "r" (value) ); 37.231 break; 37.232 case 3: 37.233 - if ( value > (PAGE_OFFSET-4) ) return -EPERM; 37.234 + if ( !DB_VALID_ADDR(value) ) return -EPERM; 37.235 if ( p == current ) 37.236 __asm__ ( "mov %0, %%db3" : : "r" (value) ); 37.237 break;
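The rewritten NMI handlers above work on the classic PC status/control port 0x61: on the read side, bit 7 reports a memory parity error and bit 6 an I/O check error; on the write side, setting bit 2 or bit 3 clears and masks the respective line. Both handlers keep only the low four (writable) bits and OR in the clear bit, which could be factored as in this illustrative helper (not part of the patch):

    static void nmi_clear_line(int io_check)
    {
        unsigned char v = inb(0x61) & 0x0f;           /* writable bits only   */
        outb(v | (io_check ? 0x08 : 0x04), 0x61);     /* clear + disable line */
    }

Note also that the 'dom0' case in each switch deliberately falls through to 'ignore' once the softirq has been raised; there is nothing further to do in the NMI context itself.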
38.1 --- a/xen/arch/x86/vmx.c Mon Feb 07 08:19:24 2005 +0000 38.2 +++ b/xen/arch/x86/vmx.c Tue Feb 08 16:44:16 2005 +0000 38.3 @@ -36,6 +36,8 @@ 38.4 #include <asm/vmx_vmcs.h> 38.5 #include <public/io/ioreq.h> 38.6 38.7 +#ifdef CONFIG_VMX 38.8 + 38.9 int vmcs_size; 38.10 unsigned int opt_vmx_debug_level; 38.11 38.12 @@ -123,13 +125,13 @@ static int vmx_do_page_fault(unsigned lo 38.13 /* 38.14 * Set up guest page directory cache to make linear_pt_table[] work. 38.15 */ 38.16 - __guest_get_pl2e(ed, va, &gpde); 38.17 + __guest_get_l2e(ed, va, &gpde); 38.18 if (!(gpde & _PAGE_PRESENT)) 38.19 return 0; 38.20 38.21 index = (va >> L2_PAGETABLE_SHIFT); 38.22 if (!l2_pgentry_val(ed->arch.guest_pl2e_cache[index])) { 38.23 - pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT]; 38.24 + pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT); 38.25 38.26 VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault: pagetable = %lx\n", 38.27 pagetable_val(ed->arch.pagetable)); 38.28 @@ -301,10 +303,10 @@ inline unsigned long gva_to_gpa(unsigned 38.29 unsigned long gpde, gpte, pfn, index; 38.30 struct exec_domain *ed = current; 38.31 38.32 - __guest_get_pl2e(ed, gva, &gpde); 38.33 + __guest_get_l2e(ed, gva, &gpde); 38.34 index = (gva >> L2_PAGETABLE_SHIFT); 38.35 38.36 - pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT]; 38.37 + pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT); 38.38 38.39 ed->arch.guest_pl2e_cache[index] = 38.40 mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 38.41 @@ -451,8 +453,8 @@ static void mov_to_cr(int gp, int cr, st 38.42 /* 38.43 * The guest CR3 must be pointing to the guest physical. 38.44 */ 38.45 - if (!(pfn = phys_to_machine_mapping[ 38.46 - d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT])) 38.47 + if (!(pfn = phys_to_machine_mapping( 38.48 + d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT))) 38.49 { 38.50 VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value = %lx\n", 38.51 d->arch.arch_vmx.cpu_cr3); 38.52 @@ -504,7 +506,7 @@ static void mov_to_cr(int gp, int cr, st 38.53 * removed some translation or changed page attributes. 38.54 * We simply invalidate the shadow. 38.55 */ 38.56 - pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; 38.57 + pfn = phys_to_machine_mapping(value >> PAGE_SHIFT); 38.58 if ((pfn << PAGE_SHIFT) != pagetable_val(d->arch.pagetable)) 38.59 __vmx_bug(regs); 38.60 vmx_shadow_clear_state(d->domain); 38.61 @@ -521,7 +523,7 @@ static void mov_to_cr(int gp, int cr, st 38.62 "Invalid CR3 value=%lx\n", value); 38.63 domain_crash(); /* need to take a clean path */ 38.64 } 38.65 - pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; 38.66 + pfn = phys_to_machine_mapping(value >> PAGE_SHIFT); 38.67 vmx_shadow_clear_state(d->domain); 38.68 d->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 38.69 shadow_mk_pagetable(d); 38.70 @@ -927,6 +929,8 @@ asmlinkage void vmx_vmexit_handler(struc 38.71 default: 38.72 __vmx_bug(&regs); /* should not happen */ 38.73 } 38.74 + 38.75 + vmx_intr_assist(d); 38.76 return; 38.77 } 38.78 38.79 @@ -937,3 +941,5 @@ asmlinkage void load_cr2(void) 38.80 local_irq_disable(); 38.81 asm volatile("movl %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2)); 38.82 } 38.83 + 38.84 +#endif /* CONFIG_VMX */

39.1 --- a/xen/arch/x86/vmx_io.c Mon Feb 07 08:19:24 2005 +0000 39.2 +++ b/xen/arch/x86/vmx_io.c Tue Feb 08 16:44:16 2005 +0000 39.3 @@ -32,6 +32,8 @@ 39.4 #include <public/io/ioreq.h> 39.5 #include <asm/vmx_platform.h> 39.6 39.7 +#ifdef CONFIG_VMX 39.8 + 39.9 extern long do_block(); 39.10 39.11 #if defined (__i386__) 39.12 @@ -386,3 +388,5 @@ void vmx_do_resume(struct exec_domain *d 39.13 if (!test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) 39.14 vmx_intr_assist(d); 39.15 } 39.16 + 39.17 +#endif /* CONFIG_VMX */
40.1 --- a/xen/arch/x86/vmx_platform.c Mon Feb 07 08:19:24 2005 +0000 40.2 +++ b/xen/arch/x86/vmx_platform.c Tue Feb 08 16:44:16 2005 +0000 40.3 @@ -34,6 +34,8 @@ 40.4 #include <xen/sched.h> 40.5 #include <asm/current.h> 40.6 40.7 +#ifdef CONFIG_VMX 40.8 + 40.9 #define DECODE_success 1 40.10 #define DECODE_failure 0 40.11 40.12 @@ -369,7 +371,7 @@ static int inst_copy_from_guest(char *bu 40.13 printk("inst_copy_from_guest- EXIT: read gpte faulted" ); 40.14 return 0; 40.15 } 40.16 - mfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; 40.17 + mfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT); 40.18 ma = (mfn << PAGE_SHIFT) | (guest_eip & (PAGE_SIZE - 1)); 40.19 inst_start = (unsigned char *)map_domain_mem(ma); 40.20 40.21 @@ -553,3 +555,4 @@ void handle_mmio(unsigned long va, unsig 40.22 domain_crash(); 40.23 } 40.24 40.25 +#endif /* CONFIG_VMX */
41.1 --- a/xen/arch/x86/vmx_vmcs.c Mon Feb 07 08:19:24 2005 +0000 41.2 +++ b/xen/arch/x86/vmx_vmcs.c Tue Feb 08 16:44:16 2005 +0000 41.3 @@ -33,6 +33,8 @@ 41.4 #include <public/io/ioreq.h> 41.5 #include <asm/domain_page.h> 41.6 41.7 +#ifdef CONFIG_VMX 41.8 + 41.9 struct vmcs_struct *alloc_vmcs(void) 41.10 { 41.11 struct vmcs_struct *vmcs; 41.12 @@ -118,7 +120,7 @@ int vmx_setup_platform(struct exec_domai 41.13 addr = context->edi; 41.14 offset = (addr & ~PAGE_MASK); 41.15 addr = round_pgdown(addr); 41.16 - mpfn = phys_to_machine_mapping[addr >> PAGE_SHIFT]; 41.17 + mpfn = phys_to_machine_mapping(addr >> PAGE_SHIFT); 41.18 p = map_domain_mem(mpfn << PAGE_SHIFT); 41.19 41.20 e820p = (struct e820entry *) ((unsigned long) p + offset); 41.21 @@ -131,52 +133,20 @@ int vmx_setup_platform(struct exec_domai 41.22 } 41.23 41.24 if (gpfn == 0) { 41.25 - VMX_DBG_LOG(DBG_LEVEL_1, "No shared Page ?\n"); 41.26 + printk("No shared Page ?\n"); 41.27 + unmap_domain_mem(p); 41.28 return -1; 41.29 } 41.30 unmap_domain_mem(p); 41.31 41.32 - mpfn = phys_to_machine_mapping[gpfn]; 41.33 + mpfn = phys_to_machine_mapping(gpfn); 41.34 p = map_domain_mem(mpfn << PAGE_SHIFT); 41.35 + ASSERT(p != NULL); 41.36 d->arch.arch_vmx.vmx_platform.shared_page_va = (unsigned long) p; 41.37 41.38 return 0; 41.39 } 41.40 41.41 - 41.42 -/* 41.43 - * Add <guest pfn, machine pfn> mapping to per-domain mapping. Full 41.44 - * virtualization does not need per-domain mapping. 41.45 - */ 41.46 -static int add_mapping_perdomain(struct exec_domain *d, unsigned long gpfn, 41.47 - unsigned long mpfn) 41.48 -{ 41.49 - struct pfn_info *page; 41.50 - unsigned long pfn = 0; 41.51 - 41.52 - /* 41.53 - * We support up to 4GB memory for a guest at this point 41.54 - */ 41.55 - if (gpfn > ENTRIES_PER_L2_PAGETABLE * ENTRIES_PER_L1_PAGETABLE) 41.56 - return -1; 41.57 - 41.58 - if (!(l1_pgentry_val(d->domain->arch.mm_perdomain_pt[ 41.59 - gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]) & _PAGE_PRESENT)) 41.60 - { 41.61 - page = (struct pfn_info *) alloc_domheap_page(NULL); 41.62 - if (!page) { 41.63 - return -1; 41.64 - } 41.65 - 41.66 - pfn = (unsigned long) (page - frame_table); 41.67 - d->domain->arch.mm_perdomain_pt[gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)] = 41.68 - mk_l1_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 41.69 - } 41.70 - phys_to_machine_mapping[gpfn] = mpfn; 41.71 - 41.72 - return 0; 41.73 -} 41.74 - 41.75 void vmx_do_launch(struct exec_domain *ed) 41.76 { 41.77 /* Update CR3, GDT, LDT, TR */ 41.78 @@ -204,7 +174,6 @@ void vmx_do_launch(struct exec_domain *e 41.79 d->arch.min_pfn = min(d->arch.min_pfn, pfn); 41.80 d->arch.max_pfn = max(d->arch.max_pfn, pfn); 41.81 list_ent = frame_table[pfn].list.next; 41.82 - add_mapping_perdomain(ed, i, pfn); 41.83 } 41.84 41.85 spin_unlock(&d->page_alloc_lock); 41.86 @@ -502,3 +471,4 @@ void vm_resume_fail(unsigned long eflags 41.87 BUG(); 41.88 } 41.89 41.90 +#endif /* CONFIG_VMX */
42.1 --- a/xen/arch/x86/x86_32/domain_build.c Mon Feb 07 08:19:24 2005 +0000 42.2 +++ b/xen/arch/x86/x86_32/domain_build.c Tue Feb 08 16:44:16 2005 +0000 42.3 @@ -20,6 +20,7 @@ 42.4 #include <xen/event.h> 42.5 #include <xen/elf.h> 42.6 #include <xen/kernel.h> 42.7 +#include <asm/shadow.h> 42.8 42.9 /* No ring-3 access in initial page tables. */ 42.10 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) 42.11 @@ -261,7 +262,7 @@ int construct_dom0(struct domain *d, 42.12 for ( count = 0; count < nr_pt_pages; count++ ) 42.13 { 42.14 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); 42.15 - page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; 42.16 + page = &frame_table[l1_pgentry_to_pfn(*l1tab)]; 42.17 if ( count == 0 ) 42.18 { 42.19 page->u.inuse.type_info &= ~PGT_type_mask; 42.20 @@ -377,10 +378,13 @@ int construct_dom0(struct domain *d, 42.21 42.22 new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); 42.23 42.24 -#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ 42.25 - shadow_lock(&d->mm); 42.26 - shadow_mode_enable(d, SHM_test); 42.27 - shadow_unlock(&d->mm); 42.28 +#ifndef NDEBUG 42.29 + if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ 42.30 + { 42.31 + shadow_lock(d); 42.32 + shadow_mode_enable(d, SHM_test); 42.33 + shadow_unlock(d); 42.34 + } 42.35 #endif 42.36 42.37 return 0;
43.1 --- a/xen/arch/x86/x86_32/entry.S Mon Feb 07 08:19:24 2005 +0000 43.2 +++ b/xen/arch/x86/x86_32/entry.S Tue Feb 08 16:44:16 2005 +0000 43.3 @@ -596,7 +596,7 @@ ENTRY(nmi) 43.4 # Okay, its almost a normal NMI tick. We can only process it if: 43.5 # A. We are the outermost Xen activation (in which case we have 43.6 # the selectors safely saved on our stack) 43.7 - # B. DS-GS all contain sane Xen values. 43.8 + # B. DS and ES contain sane Xen values. 43.9 # In all other cases we bail without touching DS-GS, as we have 43.10 # interrupted an enclosing Xen activation in tricky prologue or 43.11 # epilogue code. 43.12 @@ -644,11 +644,11 @@ nmi_parity_err: 43.13 orb $0x4,%al 43.14 outb %al,$0x61 43.15 cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore 43.16 - je restore_all_xen 43.17 + je nmi_out 43.18 bts $0,%ss:SYMBOL_NAME(nmi_softirq_reason) 43.19 bts $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat) 43.20 cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0 43.21 - je restore_all_xen 43.22 + je nmi_out 43.23 movl $(__HYPERVISOR_DS),%edx # nmi=fatal 43.24 movl %edx,%ds 43.25 movl %edx,%es 43.26 @@ -656,7 +656,15 @@ nmi_parity_err: 43.27 push %edx 43.28 call SYMBOL_NAME(mem_parity_error) 43.29 addl $4,%esp 43.30 - jmp ret_from_intr 43.31 +nmi_out:movl %ss:XREGS_eflags(%esp),%eax 43.32 + movb %ss:XREGS_cs(%esp),%al 43.33 + testl $(3|X86_EFLAGS_VM),%eax 43.34 + jz restore_all_xen 43.35 + movl $(__HYPERVISOR_DS),%edx 43.36 + movl %edx,%ds 43.37 + movl %edx,%es 43.38 + GET_CURRENT(%ebx) 43.39 + jmp test_all_events 43.40 43.41 nmi_io_err: 43.42 # Clear and disable the I/O-error line 43.43 @@ -664,11 +672,11 @@ nmi_io_err: 43.44 orb $0x8,%al 43.45 outb %al,$0x61 43.46 cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore 43.47 - je restore_all_xen 43.48 + je nmi_out 43.49 bts $1,%ss:SYMBOL_NAME(nmi_softirq_reason) 43.50 bts $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat) 43.51 cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0 43.52 - je restore_all_xen 43.53 + je nmi_out 43.54 movl $(__HYPERVISOR_DS),%edx # nmi=fatal 43.55 movl %edx,%ds 43.56 movl %edx,%es 43.57 @@ -676,7 +684,7 @@ nmi_io_err: 43.58 push %edx 43.59 call SYMBOL_NAME(io_check_error) 43.60 addl $4,%esp 43.61 - jmp ret_from_intr 43.62 + jmp nmi_out 43.63 43.64 43.65 ENTRY(setup_vm86_frame)
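The new nmi_out path in the assembly above merges the saved CS selector into the low byte of EFLAGS and tests for a ring 1-3 selector or a VM86 frame; only when the NMI interrupted guest context does it exit through test_all_events (so the softirq raised just before can be delivered), otherwise it restores the interrupted Xen context untouched. A rough C rendering of that test, for orientation only (the in-tree equivalent is the GUEST_FAULT-style check in traps.c):

    static inline int nmi_interrupted_guest(struct xen_regs *regs)
    {
        /* CS RPL != 0 means ring 1-3; the VM flag means a virtual-8086 frame. */
        return (regs->cs & 3) || (regs->eflags & X86_EFLAGS_VM);
    }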
44.1 --- a/xen/arch/x86/x86_32/mm.c Mon Feb 07 08:19:24 2005 +0000 44.2 +++ b/xen/arch/x86/x86_32/mm.c Tue Feb 08 16:44:16 2005 +0000 44.3 @@ -164,7 +164,7 @@ void subarch_init_memory(struct domain * 44.4 } 44.5 44.6 /* M2P table is mappable read-only by privileged domains. */ 44.7 - m2p_start_mfn = l2_pgentry_to_pagenr( 44.8 + m2p_start_mfn = l2_pgentry_to_pfn( 44.9 idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]); 44.10 for ( i = 0; i < 1024; i++ ) 44.11 { 44.12 @@ -212,9 +212,10 @@ long do_stack_switch(unsigned long ss, u 44.13 44.14 44.15 /* Returns TRUE if given descriptor is valid for GDT or LDT. */ 44.16 -int check_descriptor(unsigned long *d) 44.17 +int check_descriptor(struct desc_struct *d) 44.18 { 44.19 - unsigned long base, limit, a = d[0], b = d[1]; 44.20 + unsigned long base, limit; 44.21 + u32 a = d->a, b = d->b; 44.22 44.23 /* A not-present descriptor will always fault, so is safe. */ 44.24 if ( !(b & _SEGMENT_P) ) 44.25 @@ -298,8 +299,8 @@ int check_descriptor(unsigned long *d) 44.26 if ( !(b & _SEGMENT_G) ) 44.27 goto bad; /* too dangerous; too hard to work out... */ 44.28 limit = (limit >> 12) - 1; 44.29 - d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff; 44.30 - d[1] &= ~0xf0000; d[1] |= limit & 0xf0000; 44.31 + d->a &= ~0x0ffff; d->a |= limit & 0x0ffff; 44.32 + d->b &= ~0xf0000; d->b |= limit & 0xf0000; 44.33 } 44.34 } 44.35 44.36 @@ -310,175 +311,6 @@ int check_descriptor(unsigned long *d) 44.37 } 44.38 44.39 44.40 -void destroy_gdt(struct exec_domain *ed) 44.41 -{ 44.42 - int i; 44.43 - unsigned long pfn; 44.44 - 44.45 - for ( i = 0; i < 16; i++ ) 44.46 - { 44.47 - if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) 44.48 - put_page_and_type(&frame_table[pfn]); 44.49 - ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 44.50 - } 44.51 -} 44.52 - 44.53 - 44.54 -long set_gdt(struct exec_domain *ed, 44.55 - unsigned long *frames, 44.56 - unsigned int entries) 44.57 -{ 44.58 - struct domain *d = ed->domain; 44.59 - /* NB. There are 512 8-byte entries per GDT page. */ 44.60 - int i = 0, nr_pages = (entries + 511) / 512; 44.61 - struct desc_struct *vgdt; 44.62 - unsigned long pfn; 44.63 - 44.64 - /* Check the first page in the new GDT. */ 44.65 - if ( (pfn = frames[0]) >= max_page ) 44.66 - goto fail; 44.67 - 44.68 - /* The first page is special because Xen owns a range of entries in it. */ 44.69 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 44.70 - { 44.71 - /* GDT checks failed: try zapping the Xen reserved entries. */ 44.72 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) 44.73 - goto fail; 44.74 - vgdt = map_domain_mem(pfn << PAGE_SHIFT); 44.75 - memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, 44.76 - NR_RESERVED_GDT_ENTRIES*8); 44.77 - unmap_domain_mem(vgdt); 44.78 - put_page_and_type(&frame_table[pfn]); 44.79 - 44.80 - /* Okay, we zapped the entries. Now try the GDT checks again. */ 44.81 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 44.82 - goto fail; 44.83 - } 44.84 - 44.85 - /* Check the remaining pages in the new GDT. */ 44.86 - for ( i = 1; i < nr_pages; i++ ) 44.87 - if ( ((pfn = frames[i]) >= max_page) || 44.88 - !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 44.89 - goto fail; 44.90 - 44.91 - /* Copy reserved GDT entries to the new GDT. 
*/ 44.92 - vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 44.93 - memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 44.94 - gdt_table + FIRST_RESERVED_GDT_ENTRY, 44.95 - NR_RESERVED_GDT_ENTRIES*8); 44.96 - unmap_domain_mem(vgdt); 44.97 - 44.98 - /* Tear down the old GDT. */ 44.99 - destroy_gdt(ed); 44.100 - 44.101 - /* Install the new GDT. */ 44.102 - for ( i = 0; i < nr_pages; i++ ) 44.103 - ed->arch.perdomain_ptes[i] = 44.104 - mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 44.105 - 44.106 - SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); 44.107 - SET_GDT_ENTRIES(ed, entries); 44.108 - 44.109 - return 0; 44.110 - 44.111 - fail: 44.112 - while ( i-- > 0 ) 44.113 - put_page_and_type(&frame_table[frames[i]]); 44.114 - return -EINVAL; 44.115 -} 44.116 - 44.117 - 44.118 -long do_set_gdt(unsigned long *frame_list, unsigned int entries) 44.119 -{ 44.120 - int nr_pages = (entries + 511) / 512; 44.121 - unsigned long frames[16]; 44.122 - long ret; 44.123 - 44.124 - if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 44.125 - return -EINVAL; 44.126 - 44.127 - if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) 44.128 - return -EFAULT; 44.129 - 44.130 - LOCK_BIGLOCK(current->domain); 44.131 - 44.132 - if ( (ret = set_gdt(current, frames, entries)) == 0 ) 44.133 - { 44.134 - local_flush_tlb(); 44.135 - __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); 44.136 - } 44.137 - 44.138 - UNLOCK_BIGLOCK(current->domain); 44.139 - 44.140 - return ret; 44.141 -} 44.142 - 44.143 - 44.144 -long do_update_descriptor( 44.145 - unsigned long pa, unsigned long word1, unsigned long word2) 44.146 -{ 44.147 - unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2]; 44.148 - struct pfn_info *page; 44.149 - struct exec_domain *ed; 44.150 - long ret = -EINVAL; 44.151 - 44.152 - d[0] = word1; 44.153 - d[1] = word2; 44.154 - 44.155 - LOCK_BIGLOCK(current->domain); 44.156 - 44.157 - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) ) { 44.158 - UNLOCK_BIGLOCK(current->domain); 44.159 - return -EINVAL; 44.160 - } 44.161 - 44.162 - page = &frame_table[pfn]; 44.163 - if ( unlikely(!get_page(page, current->domain)) ) { 44.164 - UNLOCK_BIGLOCK(current->domain); 44.165 - return -EINVAL; 44.166 - } 44.167 - 44.168 - /* Check if the given frame is in use in an unsafe context. */ 44.169 - switch ( page->u.inuse.type_info & PGT_type_mask ) 44.170 - { 44.171 - case PGT_gdt_page: 44.172 - /* Disallow updates of Xen-reserved descriptors in the current GDT. */ 44.173 - for_each_exec_domain(current->domain, ed) { 44.174 - if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) && 44.175 - (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 44.176 - (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 44.177 - goto out; 44.178 - } 44.179 - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 44.180 - goto out; 44.181 - break; 44.182 - case PGT_ldt_page: 44.183 - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 44.184 - goto out; 44.185 - break; 44.186 - default: 44.187 - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) 44.188 - goto out; 44.189 - break; 44.190 - } 44.191 - 44.192 - /* All is good so make the update. 
*/ 44.193 - gdt_pent = map_domain_mem(pa); 44.194 - memcpy(gdt_pent, d, 8); 44.195 - unmap_domain_mem(gdt_pent); 44.196 - 44.197 - put_page_type(page); 44.198 - 44.199 - ret = 0; /* success */ 44.200 - 44.201 - out: 44.202 - put_page(page); 44.203 - 44.204 - UNLOCK_BIGLOCK(current->domain); 44.205 - 44.206 - return ret; 44.207 -} 44.208 - 44.209 #ifdef MEMORY_GUARD 44.210 44.211 void *memguard_init(void *heap_start)
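check_descriptor() now takes a struct desc_struct, but still works on the raw descriptor words: 'a' is the low 32 bits and 'b' the high 32 bits, with the base and limit scattered across both. The field extraction it relies on can be summarised by this illustrative helper (the function itself is not in the tree):

    static void decode_seg_descriptor(u32 a, u32 b,
                                      unsigned long *base, unsigned long *limit)
    {
        *base  = (b & (0xfful << 24)) | ((b & 0xfful) << 16) | (a >> 16);
        *limit = ((b & 0xf0000ul) | (a & 0xfffful)) + 1;  /* limit is inclusive */
        if ( b & _SEGMENT_G )
            *limit <<= 12;                                /* 4kB granularity */
    }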
45.1 --- a/xen/arch/x86/x86_32/traps.c Mon Feb 07 08:19:24 2005 +0000 45.2 +++ b/xen/arch/x86/x86_32/traps.c Tue Feb 08 16:44:16 2005 +0000 45.3 @@ -7,6 +7,7 @@ 45.4 #include <xen/console.h> 45.5 #include <xen/mm.h> 45.6 #include <xen/irq.h> 45.7 +#include <asm/flushtlb.h> 45.8 45.9 static int kstack_depth_to_print = 8*20; 45.10 45.11 @@ -114,6 +115,7 @@ void show_registers(struct xen_regs *reg 45.12 regs->esi, regs->edi, regs->ebp, esp); 45.13 printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", 45.14 ds, es, fs, gs, ss); 45.15 + printk("cr3: %08lx\n", read_cr3()); 45.16 45.17 show_stack((unsigned long *)®s->esp); 45.18 } 45.19 @@ -175,34 +177,33 @@ asmlinkage void do_double_fault(void) 45.20 __asm__ __volatile__ ( "hlt" ); 45.21 } 45.22 45.23 -void __init doublefault_init(void) 45.24 +void __init percpu_traps_init(void) 45.25 { 45.26 - /* 45.27 - * Make a separate task for double faults. This will get us debug output if 45.28 - * we blow the kernel stack. 45.29 - */ 45.30 - struct tss_struct *tss = &doublefault_tss; 45.31 - memset(tss, 0, sizeof(*tss)); 45.32 - tss->ds = __HYPERVISOR_DS; 45.33 - tss->es = __HYPERVISOR_DS; 45.34 - tss->ss = __HYPERVISOR_DS; 45.35 - tss->esp = (unsigned long) 45.36 - &doublefault_stack[DOUBLEFAULT_STACK_SIZE]; 45.37 - tss->__cr3 = __pa(idle_pg_table); 45.38 - tss->cs = __HYPERVISOR_CS; 45.39 - tss->eip = (unsigned long)do_double_fault; 45.40 - tss->eflags = 2; 45.41 - tss->bitmap = IOBMP_INVALID_OFFSET; 45.42 - _set_tssldt_desc(gdt_table+__DOUBLEFAULT_TSS_ENTRY, 45.43 - (unsigned long)tss, 235, 9); 45.44 + if ( smp_processor_id() == 0 ) 45.45 + { 45.46 + /* 45.47 + * Make a separate task for double faults. This will get us debug 45.48 + * output if we blow the kernel stack. 45.49 + */ 45.50 + struct tss_struct *tss = &doublefault_tss; 45.51 + memset(tss, 0, sizeof(*tss)); 45.52 + tss->ds = __HYPERVISOR_DS; 45.53 + tss->es = __HYPERVISOR_DS; 45.54 + tss->ss = __HYPERVISOR_DS; 45.55 + tss->esp = (unsigned long) 45.56 + &doublefault_stack[DOUBLEFAULT_STACK_SIZE]; 45.57 + tss->__cr3 = __pa(idle_pg_table); 45.58 + tss->cs = __HYPERVISOR_CS; 45.59 + tss->eip = (unsigned long)do_double_fault; 45.60 + tss->eflags = 2; 45.61 + tss->bitmap = IOBMP_INVALID_OFFSET; 45.62 + _set_tssldt_desc(gdt_table+__DOUBLEFAULT_TSS_ENTRY, 45.63 + (unsigned long)tss, 235, 9); 45.64 + } 45.65 45.66 set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3); 45.67 } 45.68 45.69 -void __init percpu_traps_init(void) 45.70 -{ 45.71 -} 45.72 - 45.73 long set_fast_trap(struct exec_domain *p, int idx) 45.74 { 45.75 trap_info_t *ti;
46.1 --- a/xen/arch/x86/x86_64/domain_build.c Mon Feb 07 08:19:24 2005 +0000 46.2 +++ b/xen/arch/x86/x86_64/domain_build.c Tue Feb 08 16:44:16 2005 +0000 46.3 @@ -294,7 +294,7 @@ int construct_dom0(struct domain *d, 46.4 for ( count = 0; count < nr_pt_pages; count++ ) 46.5 { 46.6 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); 46.7 - page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; 46.8 + page = &frame_table[l1_pgentry_to_pfn(*l1tab)]; 46.9 46.10 /* Read-only mapping + PGC_allocated + page-table page. */ 46.11 page->count_info = PGC_allocated | 3;
47.1 --- a/xen/arch/x86/x86_64/entry.S Mon Feb 07 08:19:24 2005 +0000 47.2 +++ b/xen/arch/x86/x86_64/entry.S Tue Feb 08 16:44:16 2005 +0000 47.3 @@ -11,16 +11,25 @@ 47.4 #include <asm/apicdef.h> 47.5 #include <public/xen.h> 47.6 47.7 + 47.8 +/* 47.9 + * %rax = hypercall vector 47.10 + * %rdi, %rsi, %rdx, %r10, %r8, %9 = hypercall arguments 47.11 + * %r11, %rcx = SYSCALL-saved %rflags and %rip 47.12 + * NB. We must move %r10 to %rcx for C function-calling ABI. 47.13 + */ 47.14 ENTRY(hypercall) 47.15 - movl $0x0833,8(%rsp) 47.16 + sti 47.17 + movl $__GUEST_SS,8(%rsp) 47.18 pushq %r11 47.19 - pushq $0x082b 47.20 + pushq $__GUEST_CS 47.21 pushq %rcx 47.22 pushq $0 47.23 SAVE_ALL 47.24 - andq $(NR_hypercalls-1),%rax 47.25 - leaq SYMBOL_NAME(exception_table)(%rip),%rcx 47.26 - callq *(%rcx,%rax,8) 47.27 + movq %r10,%rcx 47.28 + andq $(NR_hypercalls-1),%rax 47.29 + leaq SYMBOL_NAME(hypercall_table)(%rip),%rbx 47.30 + callq *(%rbx,%rax,8) 47.31 RESTORE_ALL 47.32 addq $8,%rsp 47.33 popq %rcx 47.34 @@ -38,11 +47,12 @@ restore_all_xen: 47.35 47.36 error_code: 47.37 SAVE_ALL 47.38 + sti 47.39 movq %rsp,%rdi 47.40 movl XREGS_entry_vector(%rsp),%eax 47.41 leaq SYMBOL_NAME(exception_table)(%rip),%rdx 47.42 callq *(%rdx,%rax,8) 47.43 - jmp restore_all_xen 47.44 + jmp restore_all_xen 47.45 47.46 ENTRY(divide_error) 47.47 pushq $0 47.48 @@ -133,7 +143,13 @@ ENTRY(double_fault) 47.49 jmp error_code 47.50 47.51 ENTRY(nmi) 47.52 - iretq 47.53 + pushq $0 47.54 + SAVE_ALL 47.55 + inb $0x61,%al 47.56 + movl %eax,%esi # reason 47.57 + movq %rsp,%rdi # regs 47.58 + call SYMBOL_NAME(do_nmi) 47.59 + jmp restore_all_xen 47.60 47.61 .data 47.62
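The new comment spells out the SYSCALL hypercall convention: %rax selects the vector, the arguments arrive in %rdi, %rsi, %rdx, %r10, %r8 and %r9, and %rcx/%r11 hold the SYSCALL-saved %rip/%rflags, which is why %r10 must be copied into %rcx before calling C code. Viewed from C, the dispatch the stub performs is roughly as follows (the extern declaration and the function-pointer type are assumptions of this sketch):

    extern unsigned long hypercall_table[];   /* declaration assumed for the sketch */

    typedef unsigned long (*hypercall_fn_t)(
        unsigned long, unsigned long, unsigned long,
        unsigned long, unsigned long, unsigned long);

    static unsigned long dispatch_hypercall(
        unsigned long rax, unsigned long rdi, unsigned long rsi,
        unsigned long rdx, unsigned long r10, unsigned long r8,
        unsigned long r9)
    {
        hypercall_fn_t fn =
            (hypercall_fn_t)hypercall_table[rax & (NR_hypercalls - 1)];
        return fn(rdi, rsi, rdx, r10, r8, r9);   /* %r10 becomes argument 4 (%rcx) */
    }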
48.1 --- a/xen/arch/x86/x86_64/mm.c Mon Feb 07 08:19:24 2005 +0000 48.2 +++ b/xen/arch/x86/x86_64/mm.c Tue Feb 08 16:44:16 2005 +0000 48.3 @@ -199,7 +199,7 @@ void subarch_init_memory(struct domain * 48.4 l2e = l3_pgentry_to_l2(l3e)[l2_table_offset(v)]; 48.5 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 48.6 continue; 48.7 - m2p_start_mfn = l2_pgentry_to_pagenr(l2e); 48.8 + m2p_start_mfn = l2_pgentry_to_pfn(l2e); 48.9 48.10 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 48.11 { 48.12 @@ -240,99 +240,38 @@ long do_stack_switch(unsigned long ss, u 48.13 48.14 48.15 /* Returns TRUE if given descriptor is valid for GDT or LDT. */ 48.16 -int check_descriptor(unsigned long *d) 48.17 +int check_descriptor(struct desc_struct *d) 48.18 { 48.19 - unsigned long base, limit, a = d[0], b = d[1]; 48.20 + u32 a = d->a, b = d->b; 48.21 48.22 /* A not-present descriptor will always fault, so is safe. */ 48.23 if ( !(b & _SEGMENT_P) ) 48.24 goto good; 48.25 48.26 - /* 48.27 - * We don't allow a DPL of zero. There is no legitimate reason for 48.28 - * specifying DPL==0, and it gets rather dangerous if we also accept call 48.29 - * gates (consider a call gate pointing at another guestos descriptor with 48.30 - * DPL 0 -- this would get the OS ring-0 privileges). 48.31 - */ 48.32 - if ( (b & _SEGMENT_DPL) == 0 ) 48.33 + /* The guest can only safely be executed in ring 3. */ 48.34 + if ( (b & _SEGMENT_DPL) != 3 ) 48.35 goto bad; 48.36 48.37 - if ( !(b & _SEGMENT_S) ) 48.38 - { 48.39 - /* 48.40 - * System segment: 48.41 - * 1. Don't allow interrupt or trap gates as they belong in the IDT. 48.42 - * 2. Don't allow TSS descriptors or task gates as we don't 48.43 - * virtualise x86 tasks. 48.44 - * 3. Don't allow LDT descriptors because they're unnecessary and 48.45 - * I'm uneasy about allowing an LDT page to contain LDT 48.46 - * descriptors. In any case, Xen automatically creates the 48.47 - * required descriptor when reloading the LDT register. 48.48 - * 4. We allow call gates but they must not jump to a private segment. 48.49 - */ 48.50 - 48.51 - /* Disallow everything but call gates. */ 48.52 - if ( (b & _SEGMENT_TYPE) != 0xc00 ) 48.53 - goto bad; 48.54 + /* Any code or data segment is okay. No base/limit checking. */ 48.55 + if ( (b & _SEGMENT_S) ) 48.56 + goto good; 48.57 48.58 -#if 0 48.59 - /* Can't allow far jump to a Xen-private segment. */ 48.60 - if ( !VALID_CODESEL(a>>16) ) 48.61 - goto bad; 48.62 -#endif 48.63 + /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */ 48.64 + if ( (b & _SEGMENT_TYPE) == 0x000 ) 48.65 + goto good; 48.66 48.67 - /* Reserved bits must be zero. */ 48.68 - if ( (b & 0xe0) != 0 ) 48.69 - goto bad; 48.70 - 48.71 - /* No base/limit check is needed for a call gate. */ 48.72 - goto good; 48.73 - } 48.74 - 48.75 - /* Check that base is at least a page away from Xen-private area. */ 48.76 - base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); 48.77 - if ( base >= (PAGE_OFFSET - PAGE_SIZE) ) 48.78 + /* Everything but a call gate is discarded here. */ 48.79 + if ( (b & _SEGMENT_TYPE) != 0xc00 ) 48.80 goto bad; 48.81 48.82 - /* Check and truncate the limit if necessary. */ 48.83 - limit = (b&0xf0000) | (a&0xffff); 48.84 - limit++; /* We add one because limit is inclusive. */ 48.85 - if ( (b & _SEGMENT_G) ) 48.86 - limit <<= 12; 48.87 + /* Can't allow far jump to a Xen-private segment. 
*/ 48.88 + if ( !VALID_CODESEL(a>>16) ) 48.89 + goto bad; 48.90 48.91 - if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC ) 48.92 - { 48.93 - /* 48.94 - * Grows-down limit check. 48.95 - * NB. limit == 0xFFFFF provides no access (if G=1). 48.96 - * limit == 0x00000 provides 4GB-4kB access (if G=1). 48.97 - */ 48.98 - if ( (base + limit) > base ) 48.99 - { 48.100 - limit = -(base & PAGE_MASK); 48.101 - goto truncate; 48.102 - } 48.103 - } 48.104 - else 48.105 - { 48.106 - /* 48.107 - * Grows-up limit check. 48.108 - * NB. limit == 0xFFFFF provides 4GB access (if G=1). 48.109 - * limit == 0x00000 provides 4kB access (if G=1). 48.110 - */ 48.111 - if ( ((base + limit) <= base) || 48.112 - ((base + limit) > PAGE_OFFSET) ) 48.113 - { 48.114 - limit = PAGE_OFFSET - base; 48.115 - truncate: 48.116 - if ( !(b & _SEGMENT_G) ) 48.117 - goto bad; /* too dangerous; too hard to work out... */ 48.118 - limit = (limit >> 12) - 1; 48.119 - d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff; 48.120 - d[1] &= ~0xf0000; d[1] |= limit & 0xf0000; 48.121 - } 48.122 - } 48.123 - 48.124 + /* Reserved bits must be zero. */ 48.125 + if ( (b & 0xe0) != 0 ) 48.126 + goto bad; 48.127 + 48.128 good: 48.129 return 1; 48.130 bad: 48.131 @@ -340,159 +279,6 @@ int check_descriptor(unsigned long *d) 48.132 } 48.133 48.134 48.135 -void destroy_gdt(struct exec_domain *ed) 48.136 -{ 48.137 - int i; 48.138 - unsigned long pfn; 48.139 - 48.140 - for ( i = 0; i < 16; i++ ) 48.141 - { 48.142 - if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) 48.143 - put_page_and_type(&frame_table[pfn]); 48.144 - ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 48.145 - } 48.146 -} 48.147 - 48.148 - 48.149 -long set_gdt(struct exec_domain *ed, 48.150 - unsigned long *frames, 48.151 - unsigned int entries) 48.152 -{ 48.153 - struct domain *d = ed->domain; 48.154 - /* NB. There are 512 8-byte entries per GDT page. */ 48.155 - int i = 0, nr_pages = (entries + 511) / 512; 48.156 - struct desc_struct *vgdt; 48.157 - unsigned long pfn; 48.158 - 48.159 - /* Check the first page in the new GDT. */ 48.160 - if ( (pfn = frames[0]) >= max_page ) 48.161 - goto fail; 48.162 - 48.163 - /* The first page is special because Xen owns a range of entries in it. */ 48.164 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 48.165 - { 48.166 - /* GDT checks failed: try zapping the Xen reserved entries. */ 48.167 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) 48.168 - goto fail; 48.169 - vgdt = map_domain_mem(pfn << PAGE_SHIFT); 48.170 - memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, 48.171 - NR_RESERVED_GDT_ENTRIES*8); 48.172 - unmap_domain_mem(vgdt); 48.173 - put_page_and_type(&frame_table[pfn]); 48.174 - 48.175 - /* Okay, we zapped the entries. Now try the GDT checks again. */ 48.176 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 48.177 - goto fail; 48.178 - } 48.179 - 48.180 - /* Check the remaining pages in the new GDT. */ 48.181 - for ( i = 1; i < nr_pages; i++ ) 48.182 - if ( ((pfn = frames[i]) >= max_page) || 48.183 - !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 48.184 - goto fail; 48.185 - 48.186 - /* Copy reserved GDT entries to the new GDT. */ 48.187 - vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 48.188 - memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 48.189 - gdt_table + FIRST_RESERVED_GDT_ENTRY, 48.190 - NR_RESERVED_GDT_ENTRIES*8); 48.191 - unmap_domain_mem(vgdt); 48.192 - 48.193 - /* Tear down the old GDT. */ 48.194 - destroy_gdt(ed); 48.195 - 48.196 - /* Install the new GDT. 
*/ 48.197 - for ( i = 0; i < nr_pages; i++ ) 48.198 - ed->arch.perdomain_ptes[i] = 48.199 - mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 48.200 - 48.201 - SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); 48.202 - SET_GDT_ENTRIES(ed, entries); 48.203 - 48.204 - return 0; 48.205 - 48.206 - fail: 48.207 - while ( i-- > 0 ) 48.208 - put_page_and_type(&frame_table[frames[i]]); 48.209 - return -EINVAL; 48.210 -} 48.211 - 48.212 - 48.213 -long do_set_gdt(unsigned long *frame_list, unsigned int entries) 48.214 -{ 48.215 - int nr_pages = (entries + 511) / 512; 48.216 - unsigned long frames[16]; 48.217 - long ret; 48.218 - 48.219 - if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 48.220 - return -EINVAL; 48.221 - 48.222 - if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) 48.223 - return -EFAULT; 48.224 - 48.225 - if ( (ret = set_gdt(current, frames, entries)) == 0 ) 48.226 - { 48.227 - local_flush_tlb(); 48.228 - __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); 48.229 - } 48.230 - 48.231 - return ret; 48.232 -} 48.233 - 48.234 - 48.235 -long do_update_descriptor( 48.236 - unsigned long pa, unsigned long word1, unsigned long word2) 48.237 -{ 48.238 - unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2]; 48.239 - struct pfn_info *page; 48.240 - long ret = -EINVAL; 48.241 - 48.242 - d[0] = word1; 48.243 - d[1] = word2; 48.244 - 48.245 - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) ) 48.246 - return -EINVAL; 48.247 - 48.248 - page = &frame_table[pfn]; 48.249 - if ( unlikely(!get_page(page, current->domain)) ) 48.250 - return -EINVAL; 48.251 - 48.252 - /* Check if the given frame is in use in an unsafe context. */ 48.253 - switch ( page->u.inuse.type_info & PGT_type_mask ) 48.254 - { 48.255 - case PGT_gdt_page: 48.256 - /* Disallow updates of Xen-reserved descriptors in the current GDT. */ 48.257 - if ( (l1_pgentry_to_pagenr(current->arch.perdomain_ptes[0]) == pfn) && 48.258 - (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 48.259 - (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 48.260 - goto out; 48.261 - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 48.262 - goto out; 48.263 - break; 48.264 - case PGT_ldt_page: 48.265 - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 48.266 - goto out; 48.267 - break; 48.268 - default: 48.269 - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) 48.270 - goto out; 48.271 - break; 48.272 - } 48.273 - 48.274 - /* All is good so make the update. */ 48.275 - gdt_pent = map_domain_mem(pa); 48.276 - memcpy(gdt_pent, d, 8); 48.277 - unmap_domain_mem(gdt_pent); 48.278 - 48.279 - put_page_type(page); 48.280 - 48.281 - ret = 0; /* success */ 48.282 - 48.283 - out: 48.284 - put_page(page); 48.285 - return ret; 48.286 -} 48.287 - 48.288 #ifdef MEMORY_GUARD 48.289 48.290 #define ALLOC_PT(_level) \
49.1 --- a/xen/arch/x86/x86_64/traps.c Mon Feb 07 08:19:24 2005 +0000 49.2 +++ b/xen/arch/x86/x86_64/traps.c Tue Feb 08 16:44:16 2005 +0000 49.3 @@ -129,10 +129,7 @@ void show_page_walk(unsigned long addr) 49.4 printk(" L1 = %p\n", page); 49.5 } 49.6 49.7 -#define DOUBLEFAULT_STACK_SIZE 1024 49.8 -static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE]; 49.9 asmlinkage void double_fault(void); 49.10 - 49.11 asmlinkage void do_double_fault(struct xen_regs *regs) 49.12 { 49.13 /* Disable the NMI watchdog. It's useless now. */ 49.14 @@ -142,19 +139,9 @@ asmlinkage void do_double_fault(struct x 49.15 49.16 /* Find information saved during fault and dump it to the console. */ 49.17 printk("************************************\n"); 49.18 - printk("EIP: %04lx:[<%p>] \nEFLAGS: %p\n", 49.19 - 0xffff & regs->cs, regs->rip, regs->eflags); 49.20 - printk("rax: %p rbx: %p rcx: %p rdx: %p\n", 49.21 - regs->rax, regs->rbx, regs->rcx, regs->rdx); 49.22 - printk("rsi: %p rdi: %p rbp: %p rsp: %p\n", 49.23 - regs->rsi, regs->rdi, regs->rbp, regs->rsp); 49.24 - printk("r8: %p r9: %p r10: %p r11: %p\n", 49.25 - regs->r8, regs->r9, regs->r10, regs->r11); 49.26 - printk("r12: %p r13: %p r14: %p r15: %p\n", 49.27 - regs->r12, regs->r13, regs->r14, regs->r15); 49.28 + show_registers(regs); 49.29 printk("************************************\n"); 49.30 - printk("CPU%d DOUBLE FAULT -- system shutdown\n", 49.31 - logical_smp_processor_id()); 49.32 + printk("CPU%d DOUBLE FAULT -- system shutdown\n", smp_processor_id()); 49.33 printk("System needs manual reset.\n"); 49.34 printk("************************************\n"); 49.35 49.36 @@ -166,25 +153,29 @@ asmlinkage void do_double_fault(struct x 49.37 __asm__ __volatile__ ( "hlt" ); 49.38 } 49.39 49.40 -void __init doublefault_init(void) 49.41 -{ 49.42 - int i; 49.43 - 49.44 - /* Initialise IST1 for each CPU. Note the handler is non-reentrant. */ 49.45 - for ( i = 0; i < NR_CPUS; i++ ) 49.46 - init_tss[i].ist[0] = (unsigned long) 49.47 - &doublefault_stack[DOUBLEFAULT_STACK_SIZE]; 49.48 - 49.49 - /* Set interrupt gate for double faults, specifying IST1. */ 49.50 - set_intr_gate(TRAP_double_fault, &double_fault); 49.51 - idt_table[TRAP_double_fault].a |= 1UL << 32; /* IST1 */ 49.52 -} 49.53 - 49.54 asmlinkage void hypercall(void); 49.55 void __init percpu_traps_init(void) 49.56 { 49.57 char *stack_top = (char *)get_stack_top(); 49.58 char *stack = (char *)((unsigned long)stack_top & ~(STACK_SIZE - 1)); 49.59 + int cpu = smp_processor_id(); 49.60 + 49.61 + /* Double-fault handler has its own per-CPU 1kB stack. */ 49.62 + init_tss[cpu].ist[0] = (unsigned long)&stack[1024]; 49.63 + set_intr_gate(TRAP_double_fault, &double_fault); 49.64 + idt_tables[cpu][TRAP_double_fault].a |= 1UL << 32; /* IST1 */ 49.65 + 49.66 + /* NMI handler has its own per-CPU 1kB stack. */ 49.67 + init_tss[cpu].ist[1] = (unsigned long)&stack[2048]; 49.68 + idt_tables[cpu][TRAP_nmi].a |= 2UL << 32; /* IST2 */ 49.69 + 49.70 + /* 49.71 + * Trampoline for SYSCALL entry from long mode. 49.72 + */ 49.73 + 49.74 + /* Skip the NMI and DF stacks. 
*/ 49.75 + stack = &stack[2048]; 49.76 + wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 49.77 49.78 /* movq %rsp, saversp(%rip) */ 49.79 stack[0] = 0x48; 49.80 @@ -202,9 +193,36 @@ void __init percpu_traps_init(void) 49.81 stack[14] = 0xe9; 49.82 *(u32 *)&stack[15] = (char *)hypercall - &stack[19]; 49.83 49.84 - wrmsr(MSR_STAR, 0, (FLAT_RING3_CS64<<16) | __HYPERVISOR_CS); 49.85 - wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 49.86 - wrmsr(MSR_SYSCALL_MASK, 0xFFFFFFFFU, 0U); 49.87 + /* 49.88 + * Trampoline for SYSCALL entry from compatibility mode. 49.89 + */ 49.90 + 49.91 + /* Skip the long-mode entry trampoline. */ 49.92 + stack = &stack[19]; 49.93 + wrmsr(MSR_CSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 49.94 + 49.95 + /* movq %rsp, saversp(%rip) */ 49.96 + stack[0] = 0x48; 49.97 + stack[1] = 0x89; 49.98 + stack[2] = 0x25; 49.99 + *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16; 49.100 + 49.101 + /* leaq saversp(%rip), %rsp */ 49.102 + stack[7] = 0x48; 49.103 + stack[8] = 0x8d; 49.104 + stack[9] = 0x25; 49.105 + *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16; 49.106 + 49.107 + /* jmp hypercall */ 49.108 + stack[14] = 0xe9; 49.109 + *(u32 *)&stack[15] = (char *)hypercall - &stack[19]; 49.110 + 49.111 + /* 49.112 + * Common SYSCALL parameters. 49.113 + */ 49.114 + 49.115 + wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS); 49.116 + wrmsr(MSR_SYSCALL_MASK, ~EF_IE, 0U); /* disable interrupts */ 49.117 } 49.118 49.119 void *decode_reg(struct xen_regs *regs, u8 b)
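percpu_traps_init() above carves up the bottom of each CPU's stack page: a 1kB double-fault IST stack below offset 1024, a 1kB NMI IST stack below offset 2048, then the long-mode SYSCALL trampoline at offset 2048 and the compatibility-mode trampoline 19 bytes after it, with MSR_LSTAR and MSR_CSTAR pointed at them. The MSR writes split the 64-bit target address into 32-bit halves, a pattern that could be wrapped as in this illustrative helper (name is hypothetical):

    static inline void wrmsr_address(unsigned int msr, void *addr)
    {
        unsigned long a = (unsigned long)addr;
        wrmsr(msr, (u32)a, (u32)(a >> 32));   /* low half, then high half */
    }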
50.1 --- a/xen/common/dom_mem_ops.c Mon Feb 07 08:19:24 2005 +0000 50.2 +++ b/xen/common/dom_mem_ops.c Tue Feb 08 16:44:16 2005 +0000 50.3 @@ -25,8 +25,8 @@ 50.4 50.5 #define PREEMPT_CHECK(_op) \ 50.6 if ( hypercall_preempt_check() ) \ 50.7 - return hypercall_create_continuation( \ 50.8 - __HYPERVISOR_dom_mem_op, 5, \ 50.9 + return hypercall5_create_continuation( \ 50.10 + __HYPERVISOR_dom_mem_op, \ 50.11 (_op) | (i << START_EXTENT_SHIFT), \ 50.12 extent_list, nr_extents, extent_order, \ 50.13 (d == current->domain) ? DOMID_SELF : d->id); 50.14 @@ -122,7 +122,7 @@ free_dom_mem(struct domain *d, 50.15 long 50.16 do_dom_mem_op(unsigned long op, 50.17 unsigned long *extent_list, 50.18 - unsigned long nr_extents, 50.19 + unsigned int nr_extents, 50.20 unsigned int extent_order, 50.21 domid_t domid) 50.22 { 50.23 @@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long op, 50.24 start_extent = op >> START_EXTENT_SHIFT; 50.25 op &= (1 << START_EXTENT_SHIFT) - 1; 50.26 50.27 - if ( unlikely(start_extent > nr_extents) || 50.28 - unlikely(nr_extents > ~0U) ) /* can pack into a uint? */ 50.29 + if ( unlikely(start_extent > nr_extents) ) 50.30 return -EINVAL; 50.31 50.32 if ( likely(domid == DOMID_SELF) ) 50.33 @@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long op, 50.34 { 50.35 case MEMOP_increase_reservation: 50.36 rc = alloc_dom_mem( 50.37 - d, extent_list, start_extent, 50.38 - (unsigned int)nr_extents, extent_order); 50.39 + d, extent_list, start_extent, nr_extents, extent_order); 50.40 break; 50.41 case MEMOP_decrease_reservation: 50.42 rc = free_dom_mem( 50.43 - d, extent_list, start_extent, 50.44 - (unsigned int)nr_extents, extent_order); 50.45 + d, extent_list, start_extent, nr_extents, extent_order); 50.46 break; 50.47 default: 50.48 rc = -ENOSYS;
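The preemption path above re-encodes its progress in the 'op' argument itself: the next extent index is packed into the bits above START_EXTENT_SHIFT, and do_dom_mem_op() unpacks it again on re-entry. Schematically (the helpers are illustrative; the shift and mask are the ones used in the file):

    static inline unsigned long encode_dom_mem_op(unsigned long op,
                                                  unsigned long start_extent)
    {
        return op | (start_extent << START_EXTENT_SHIFT);
    }

    static inline void decode_dom_mem_op(unsigned long packed,
                                         unsigned long *op,
                                         unsigned long *start_extent)
    {
        *start_extent = packed >> START_EXTENT_SHIFT;
        *op           = packed & ((1 << START_EXTENT_SHIFT) - 1);
    }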
51.1 --- a/xen/common/domain.c Mon Feb 07 08:19:24 2005 +0000 51.2 +++ b/xen/common/domain.c Tue Feb 08 16:44:16 2005 +0000 51.3 @@ -45,8 +45,6 @@ struct domain *do_createdomain(domid_t d 51.4 ed->processor = cpu; 51.5 d->create_time = NOW(); 51.6 51.7 - memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch)); 51.8 - 51.9 spin_lock_init(&d->time_lock); 51.10 51.11 spin_lock_init(&d->big_lock);
52.1 --- a/xen/common/elf.c Mon Feb 07 08:19:24 2005 +0000 52.2 +++ b/xen/common/elf.c Tue Feb 08 16:44:16 2005 +0000 52.3 @@ -13,10 +13,8 @@ 52.4 52.5 #ifdef CONFIG_X86 52.6 #define FORCE_XENELF_IMAGE 1 52.7 -#define ELF_ADDR p_vaddr 52.8 #elif defined(__ia64__) 52.9 #define FORCE_XENELF_IMAGE 0 52.10 -#define ELF_ADDR p_paddr 52.11 #endif 52.12 52.13 static inline int is_loadable_phdr(Elf_Phdr *phdr) 52.14 @@ -100,10 +98,10 @@ int parseelfimage(char *elfbase, 52.15 phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); 52.16 if ( !is_loadable_phdr(phdr) ) 52.17 continue; 52.18 - if ( phdr->ELF_ADDR < kernstart ) 52.19 - kernstart = phdr->ELF_ADDR; 52.20 - if ( (phdr->ELF_ADDR + phdr->p_memsz) > kernend ) 52.21 - kernend = phdr->ELF_ADDR + phdr->p_memsz; 52.22 + if ( phdr->p_paddr < kernstart ) 52.23 + kernstart = phdr->p_paddr; 52.24 + if ( (phdr->p_paddr + phdr->p_memsz) > kernend ) 52.25 + kernend = phdr->p_paddr + phdr->p_memsz; 52.26 } 52.27 52.28 if ( (kernstart > kernend) || 52.29 @@ -144,10 +142,10 @@ int loadelfimage(char *elfbase) 52.30 if ( !is_loadable_phdr(phdr) ) 52.31 continue; 52.32 if ( phdr->p_filesz != 0 ) 52.33 - memcpy((char *)phdr->ELF_ADDR, elfbase + phdr->p_offset, 52.34 + memcpy((char *)phdr->p_paddr, elfbase + phdr->p_offset, 52.35 phdr->p_filesz); 52.36 if ( phdr->p_memsz > phdr->p_filesz ) 52.37 - memset((char *)phdr->ELF_ADDR + phdr->p_filesz, 0, 52.38 + memset((char *)phdr->p_paddr + phdr->p_filesz, 0, 52.39 phdr->p_memsz - phdr->p_filesz); 52.40 } 52.41
53.1 --- a/xen/common/keyhandler.c Mon Feb 07 08:19:24 2005 +0000 53.2 +++ b/xen/common/keyhandler.c Tue Feb 08 16:44:16 2005 +0000 53.3 @@ -27,7 +27,7 @@ static struct { 53.4 53.5 static unsigned char keypress_key; 53.6 53.7 -void keypress_softirq(void) 53.8 +static void keypress_softirq(void) 53.9 { 53.10 keyhandler_t *h; 53.11 unsigned char key = keypress_key; 53.12 @@ -94,7 +94,7 @@ static void halt_machine(unsigned char k 53.13 machine_restart(NULL); 53.14 } 53.15 53.16 -void do_task_queues(unsigned char key) 53.17 +static void do_task_queues(unsigned char key) 53.18 { 53.19 struct domain *d; 53.20 struct exec_domain *ed;
54.1 --- a/xen/common/multicall.c Mon Feb 07 08:19:24 2005 +0000 54.2 +++ b/xen/common/multicall.c Tue Feb 08 16:44:16 2005 +0000 54.3 @@ -67,8 +67,8 @@ long do_multicall(multicall_entry_t *cal 54.4 if ( i < nr_calls ) 54.5 { 54.6 mcs->flags = 0; 54.7 - return hypercall_create_continuation( 54.8 - __HYPERVISOR_multicall, 2, &call_list[i], nr_calls-i); 54.9 + return hypercall2_create_continuation( 54.10 + __HYPERVISOR_multicall, &call_list[i], nr_calls-i); 54.11 } 54.12 } 54.13 }
55.1 --- a/xen/common/physdev.c Mon Feb 07 08:19:24 2005 +0000 55.2 +++ b/xen/common/physdev.c Tue Feb 08 16:44:16 2005 +0000 55.3 @@ -720,7 +720,7 @@ string_param("physdev_dom0_hide", opt_ph 55.4 55.5 /* Test if boot params specify this device should NOT be visible to DOM0 55.6 * (e.g. so that another domain can control it instead) */ 55.7 -int pcidev_dom0_hidden(struct pci_dev *dev) 55.8 +static int pcidev_dom0_hidden(struct pci_dev *dev) 55.9 { 55.10 char cmp[10] = "(.......)"; 55.11
56.1 --- a/xen/common/resource.c Mon Feb 07 08:19:24 2005 +0000 56.2 +++ b/xen/common/resource.c Tue Feb 08 16:44:16 2005 +0000 56.3 @@ -254,19 +254,6 @@ struct resource * __request_region(struc 56.4 return res; 56.5 } 56.6 56.7 -int __check_region(struct resource *parent, unsigned long start, unsigned long n) 56.8 -{ 56.9 - struct resource * res; 56.10 - 56.11 - res = __request_region(parent, start, n, "check-region"); 56.12 - if (!res) 56.13 - return -EBUSY; 56.14 - 56.15 - release_resource(res); 56.16 - xfree(res); 56.17 - return 0; 56.18 -} 56.19 - 56.20 void __release_region(struct resource *parent, unsigned long start, unsigned long n) 56.21 { 56.22 struct resource **p;
57.1 --- a/xen/common/sched_bvt.c Mon Feb 07 08:19:24 2005 +0000 57.2 +++ b/xen/common/sched_bvt.c Tue Feb 08 16:44:16 2005 +0000 57.3 @@ -167,7 +167,7 @@ static inline u32 calc_evt(struct exec_d 57.4 * 57.5 * Returns non-zero on failure. 57.6 */ 57.7 -int bvt_alloc_task(struct exec_domain *ed) 57.8 +static int bvt_alloc_task(struct exec_domain *ed) 57.9 { 57.10 struct domain *d = ed->domain; 57.11 if ( (d->sched_priv == NULL) ) { 57.12 @@ -184,7 +184,7 @@ int bvt_alloc_task(struct exec_domain *e 57.13 /* 57.14 * Add and remove a domain 57.15 */ 57.16 -void bvt_add_task(struct exec_domain *d) 57.17 +static void bvt_add_task(struct exec_domain *d) 57.18 { 57.19 struct bvt_dom_info *inf = BVT_INFO(d->domain); 57.20 struct bvt_edom_info *einf = EBVT_INFO(d); 57.21 @@ -225,7 +225,7 @@ void bvt_add_task(struct exec_domain *d) 57.22 } 57.23 } 57.24 57.25 -int bvt_init_idle_task(struct exec_domain *p) 57.26 +static int bvt_init_idle_task(struct exec_domain *p) 57.27 { 57.28 if ( bvt_alloc_task(p) < 0 ) 57.29 return -1; 57.30 @@ -239,7 +239,7 @@ int bvt_init_idle_task(struct exec_domai 57.31 return 0; 57.32 } 57.33 57.34 -void bvt_wake(struct exec_domain *d) 57.35 +static void bvt_wake(struct exec_domain *d) 57.36 { 57.37 struct bvt_edom_info *einf = EBVT_INFO(d); 57.38 struct exec_domain *curr; 57.39 @@ -290,14 +290,14 @@ static void bvt_sleep(struct exec_domain 57.40 * bvt_free_task - free BVT private structures for a task 57.41 * @d: task 57.42 */ 57.43 -void bvt_free_task(struct domain *d) 57.44 +static void bvt_free_task(struct domain *d) 57.45 { 57.46 ASSERT(d->sched_priv != NULL); 57.47 xfree(d->sched_priv); 57.48 } 57.49 57.50 /* Control the scheduler. */ 57.51 -int bvt_ctl(struct sched_ctl_cmd *cmd) 57.52 +static int bvt_ctl(struct sched_ctl_cmd *cmd) 57.53 { 57.54 struct bvt_ctl *params = &cmd->u.bvt; 57.55 57.56 @@ -310,7 +310,7 @@ int bvt_ctl(struct sched_ctl_cmd *cmd) 57.57 } 57.58 57.59 /* Adjust scheduling parameter for a given domain. */ 57.60 -int bvt_adjdom( 57.61 +static int bvt_adjdom( 57.62 struct domain *d, struct sched_adjdom_cmd *cmd) 57.63 { 57.64 struct bvt_adjdom *params = &cmd->u.bvt; 57.65 @@ -549,7 +549,7 @@ static void bvt_dump_cpu_state(int i) 57.66 } 57.67 57.68 /* Initialise the data structures. */ 57.69 -int bvt_init_scheduler() 57.70 +static int bvt_init_scheduler(void) 57.71 { 57.72 int i; 57.73
58.1 --- a/xen/drivers/pci/Makefile Mon Feb 07 08:19:24 2005 +0000 58.2 +++ b/xen/drivers/pci/Makefile Tue Feb 08 16:44:16 2005 +0000 58.3 @@ -4,7 +4,7 @@ 58.4 58.5 include $(BASEDIR)/Rules.mk 58.6 58.7 -OBJS := pci.o quirks.o compat.o names.o setup-res.o 58.8 +OBJS := pci.o quirks.o names.o setup-res.o 58.9 58.10 #obj-$(CONFIG_PCI) += pci.o quirks.o compat.o names.o 58.11 #obj-$(CONFIG_PROC_FS) += proc.o
59.1 --- a/xen/drivers/pci/compat.c Mon Feb 07 08:19:24 2005 +0000 59.2 +++ b/xen/drivers/pci/compat.c Tue Feb 08 16:44:16 2005 +0000 59.3 @@ -1,65 +0,0 @@ 59.4 -/* 59.5 - * $Id: compat.c,v 1.1 1998/02/16 10:35:50 mj Exp $ 59.6 - * 59.7 - * PCI Bus Services -- Function For Backward Compatibility 59.8 - * 59.9 - * Copyright 1998--2000 Martin Mares <mj@ucw.cz> 59.10 - */ 59.11 - 59.12 -#include <xen/types.h> 59.13 -//#include <xen/kernel.h> 59.14 -#include <xen/pci.h> 59.15 - 59.16 -int 59.17 -pcibios_present(void) 59.18 -{ 59.19 - return !list_empty(&pci_devices); 59.20 -} 59.21 - 59.22 -int 59.23 -pcibios_find_class(unsigned int class, unsigned short index, unsigned char *bus, unsigned char *devfn) 59.24 -{ 59.25 - const struct pci_dev *dev = NULL; 59.26 - int cnt = 0; 59.27 - 59.28 - while ((dev = pci_find_class(class, dev))) 59.29 - if (index == cnt++) { 59.30 - *bus = dev->bus->number; 59.31 - *devfn = dev->devfn; 59.32 - return PCIBIOS_SUCCESSFUL; 59.33 - } 59.34 - return PCIBIOS_DEVICE_NOT_FOUND; 59.35 -} 59.36 - 59.37 - 59.38 -int 59.39 -pcibios_find_device(unsigned short vendor, unsigned short device, unsigned short index, 59.40 - unsigned char *bus, unsigned char *devfn) 59.41 -{ 59.42 - const struct pci_dev *dev = NULL; 59.43 - int cnt = 0; 59.44 - 59.45 - while ((dev = pci_find_device(vendor, device, dev))) 59.46 - if (index == cnt++) { 59.47 - *bus = dev->bus->number; 59.48 - *devfn = dev->devfn; 59.49 - return PCIBIOS_SUCCESSFUL; 59.50 - } 59.51 - return PCIBIOS_DEVICE_NOT_FOUND; 59.52 -} 59.53 - 59.54 -#define PCI_OP(rw,size,type) \ 59.55 -int pcibios_##rw##_config_##size (unsigned char bus, unsigned char dev_fn, \ 59.56 - unsigned char where, unsigned type val) \ 59.57 -{ \ 59.58 - struct pci_dev *dev = pci_find_slot(bus, dev_fn); \ 59.59 - if (!dev) return PCIBIOS_DEVICE_NOT_FOUND; \ 59.60 - return pci_##rw##_config_##size(dev, where, val); \ 59.61 -} 59.62 - 59.63 -PCI_OP(read, byte, char *) 59.64 -PCI_OP(read, word, short *) 59.65 -PCI_OP(read, dword, int *) 59.66 -PCI_OP(write, byte, char) 59.67 -PCI_OP(write, word, short) 59.68 -PCI_OP(write, dword, int)
60.1 --- a/xen/include/asm-x86/config.h Mon Feb 07 08:19:24 2005 +0000 60.2 +++ b/xen/include/asm-x86/config.h Tue Feb 08 16:44:16 2005 +0000 60.3 @@ -191,6 +191,10 @@ extern void __out_of_line_bug(int line) 60.4 #define __HYPERVISOR_DS32 0x0818 60.5 #define __HYPERVISOR_DS __HYPERVISOR_DS64 60.6 60.7 +#define __GUEST_CS 0x0833 60.8 +#define __GUEST_DS 0x0000 60.9 +#define __GUEST_SS 0x082b 60.10 + 60.11 /* For generic assembly code: use macros to define operation/operand sizes. */ 60.12 #define __OS "q" /* Operation Suffix */ 60.13 #define __OP "r" /* Operand Prefix */
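The new __GUEST_CS/__GUEST_DS/__GUEST_SS values (0x0833, 0x0000, 0x082b) match the renumbered FLAT_RING3_CS64/DS64/SS64 selectors in the public arch-x86_64.h header further down in this changeset. A hypothetical compile-time cross-check, assuming both headers are visible in the same translation unit:

    /* Hypothetical sanity check: private __GUEST_* should track public FLAT_RING3_*64. */
    #if defined(__x86_64__)
    #if (__GUEST_CS != FLAT_RING3_CS64) || \
        (__GUEST_DS != FLAT_RING3_DS64) || \
        (__GUEST_SS != FLAT_RING3_SS64)
    #error "__GUEST_* selectors out of sync with FLAT_RING3_*64"
    #endif
    #endif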
61.1 --- a/xen/include/asm-x86/domain.h Mon Feb 07 08:19:24 2005 +0000 61.2 +++ b/xen/include/asm-x86/domain.h Tue Feb 08 16:44:16 2005 +0000 61.3 @@ -96,6 +96,7 @@ struct arch_exec_domain 61.4 pagetable_t pagetable; 61.5 61.6 pagetable_t monitor_table; 61.7 + pagetable_t phys_table; /* 1:1 pagetable */ 61.8 pagetable_t shadow_table; 61.9 l2_pgentry_t *vpagetable; /* virtual address of pagetable */ 61.10 l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */
62.1 --- a/xen/include/asm-x86/mm.h Mon Feb 07 08:19:24 2005 +0000 62.2 +++ b/xen/include/asm-x86/mm.h Tue Feb 08 16:44:16 2005 +0000 62.3 @@ -13,6 +13,7 @@ 62.4 #include <asm/desc.h> 62.5 #include <asm/flushtlb.h> 62.6 #include <asm/io.h> 62.7 +#include <asm/uaccess.h> 62.8 62.9 #include <public/xen.h> 62.10 62.11 @@ -218,7 +219,7 @@ static inline int get_page_and_type(stru 62.12 ASSERT(((_p)->count_info & PGC_count_mask) != 0); \ 62.13 ASSERT(page_get_owner(_p) == (_d)) 62.14 62.15 -int check_descriptor(unsigned long *d); 62.16 +int check_descriptor(struct desc_struct *d); 62.17 62.18 /* 62.19 * Use currently-executing domain's pagetables on the specified CPUs. 62.20 @@ -241,8 +242,20 @@ void synchronise_pagetables(unsigned lon 62.21 #undef phys_to_machine_mapping 62.22 62.23 #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) 62.24 -#define phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START) 62.25 +#define __phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START) 62.26 +/* Returns the machine physical */ 62.27 +static inline unsigned long phys_to_machine_mapping(unsigned long pfn) 62.28 +{ 62.29 + unsigned long mfn; 62.30 + l1_pgentry_t pte; 62.31 62.32 + if (__get_user(l1_pgentry_val(pte), (__phys_to_machine_mapping + pfn))) { 62.33 + return 0; 62.34 + } 62.35 + 62.36 + mfn = l1_pgentry_to_phys(pte) >> PAGE_SHIFT; 62.37 + return mfn; 62.38 +} 62.39 #define set_machinetophys(_mfn, _pfn) machine_to_phys_mapping[(_mfn)] = (_pfn) 62.40 62.41 #define DEFAULT_GDT_ENTRIES (LAST_RESERVED_GDT_ENTRY+1)
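phys_to_machine_mapping is now an inline function that reads the per-domain translation table through a guarded __get_user(), so a missing or unmapped entry yields 0 instead of faulting inside the hypervisor. Callers therefore have to treat a zero result as "no mapping"; a short usage sketch with a hypothetical helper:

    /* Hypothetical: translate a guest pfn to a machine-physical address. */
    static inline unsigned long example_gpfn_to_maddr(unsigned long gpfn)
    {
        unsigned long mfn = phys_to_machine_mapping(gpfn);
        if ( mfn == 0 )
            return 0;                 /* entry absent or __get_user() faulted */
        return mfn << PAGE_SHIFT;     /* machine-physical base of the frame   */
    }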
63.1 --- a/xen/include/asm-x86/multicall.h Mon Feb 07 08:19:24 2005 +0000 63.2 +++ b/xen/include/asm-x86/multicall.h Tue Feb 08 16:44:16 2005 +0000 63.3 @@ -9,7 +9,23 @@ 63.4 63.5 #ifdef __x86_64__ 63.6 63.7 -#define do_multicall_call(_call) BUG() 63.8 +#define do_multicall_call(_call) \ 63.9 + do { \ 63.10 + __asm__ __volatile__ ( \ 63.11 + "movq "STR(MULTICALL_op)"(%0),%%rax; " \ 63.12 + "andq $("STR(NR_hypercalls)"-1),%%rax; " \ 63.13 + "leaq "STR(hypercall_table)"(%%rip),%%rdi; "\ 63.14 + "leaq (%%rdi,%%rax,8),%%rax; " \ 63.15 + "movq "STR(MULTICALL_arg0)"(%0),%%rdi; " \ 63.16 + "movq "STR(MULTICALL_arg1)"(%0),%%rsi; " \ 63.17 + "movq "STR(MULTICALL_arg2)"(%0),%%rdx; " \ 63.18 + "movq "STR(MULTICALL_arg3)"(%0),%%rcx; " \ 63.19 + "movq "STR(MULTICALL_arg4)"(%0),%%r8; " \ 63.20 + "callq *(%%rax); " \ 63.21 + "movq %%rax,"STR(MULTICALL_result)"(%0); " \ 63.22 + : : "b" (_call) \ 63.23 + : "rax", "rdi", "rsi", "rdx", "rcx", "r8" ); \ 63.24 + } while ( 0 ) 63.25 63.26 #else 63.27
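The x86_64 do_multicall_call() now dispatches through hypercall_table, masking the op with NR_hypercalls-1 and loading the five argument slots into the x86_64 argument registers before the indirect call. Ignoring the register-level ABI, the asm corresponds roughly to the C sketch below (the field names op/args[]/result are inferred from the MULTICALL_* offsets and may not match the real multicall_entry_t layout exactly):

    /* Illustrative C equivalent of the x86_64 multicall dispatch (sketch only). */
    typedef unsigned long (*hypercall_fn_t)(
        unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
    extern hypercall_fn_t hypercall_table[];   /* hypercall dispatch table */

    static inline void example_multicall_call(multicall_entry_t *call)
    {
        hypercall_fn_t fn = hypercall_table[call->op & (NR_hypercalls - 1)];
        call->result = fn(call->args[0], call->args[1], call->args[2],
                          call->args[3], call->args[4]);
    }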
64.1 --- a/xen/include/asm-x86/page.h Mon Feb 07 08:19:24 2005 +0000 64.2 +++ b/xen/include/asm-x86/page.h Tue Feb 08 16:44:16 2005 +0000 64.3 @@ -1,39 +1,14 @@ 64.4 -/****************************************************************************** 64.5 - * asm-x86/page.h 64.6 - * 64.7 - * Definitions relating to page tables. 64.8 - */ 64.9 +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 64.10 64.11 #ifndef __X86_PAGE_H__ 64.12 #define __X86_PAGE_H__ 64.13 64.14 -#if defined(__x86_64__) 64.15 - 64.16 -#define L1_PAGETABLE_SHIFT 12 64.17 -#define L2_PAGETABLE_SHIFT 21 64.18 -#define L3_PAGETABLE_SHIFT 30 64.19 -#define L4_PAGETABLE_SHIFT 39 64.20 - 64.21 -#define ENTRIES_PER_L1_PAGETABLE 512 64.22 -#define ENTRIES_PER_L2_PAGETABLE 512 64.23 -#define ENTRIES_PER_L3_PAGETABLE 512 64.24 -#define ENTRIES_PER_L4_PAGETABLE 512 64.25 - 64.26 -#define __PAGE_OFFSET (0xFFFF830000000000) 64.27 - 64.28 -#elif defined(__i386__) 64.29 - 64.30 -#define L1_PAGETABLE_SHIFT 12 64.31 -#define L2_PAGETABLE_SHIFT 22 64.32 - 64.33 -#define ENTRIES_PER_L1_PAGETABLE 1024 64.34 -#define ENTRIES_PER_L2_PAGETABLE 1024 64.35 - 64.36 -#define __PAGE_OFFSET (0xFC400000) 64.37 - 64.38 +#if defined(__i386__) 64.39 +#include <asm/x86_32/page.h> 64.40 +#elif defined(__x86_64__) 64.41 +#include <asm/x86_64/page.h> 64.42 #endif 64.43 64.44 -#define PAGE_SHIFT L1_PAGETABLE_SHIFT 64.45 #ifndef __ASSEMBLY__ 64.46 #define PAGE_SIZE (1UL << PAGE_SHIFT) 64.47 #else 64.48 @@ -44,77 +19,9 @@ 64.49 #define clear_page(_p) memset((void *)(_p), 0, PAGE_SIZE) 64.50 #define copy_page(_t,_f) memcpy((void *)(_t), (void *)(_f), PAGE_SIZE) 64.51 64.52 -#ifndef __ASSEMBLY__ 64.53 -#include <xen/config.h> 64.54 -typedef struct { unsigned long l1_lo; } l1_pgentry_t; 64.55 -typedef struct { unsigned long l2_lo; } l2_pgentry_t; 64.56 -typedef struct { unsigned long l3_lo; } l3_pgentry_t; 64.57 -typedef struct { unsigned long l4_lo; } l4_pgentry_t; 64.58 -#endif /* !__ASSEMBLY__ */ 64.59 - 64.60 -/* Strip type from a table entry. */ 64.61 -#define l1_pgentry_val(_x) ((_x).l1_lo) 64.62 -#define l2_pgentry_val(_x) ((_x).l2_lo) 64.63 -#define l3_pgentry_val(_x) ((_x).l3_lo) 64.64 -#define l4_pgentry_val(_x) ((_x).l4_lo) 64.65 - 64.66 -/* Add type to a table entry. */ 64.67 -#define mk_l1_pgentry(_x) ( (l1_pgentry_t) { (_x) } ) 64.68 -#define mk_l2_pgentry(_x) ( (l2_pgentry_t) { (_x) } ) 64.69 -#define mk_l3_pgentry(_x) ( (l3_pgentry_t) { (_x) } ) 64.70 -#define mk_l4_pgentry(_x) ( (l4_pgentry_t) { (_x) } ) 64.71 - 64.72 -/* Turn a typed table entry into a page index. */ 64.73 -#define l1_pgentry_to_pagenr(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 64.74 -#define l2_pgentry_to_pagenr(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT) 64.75 -#define l3_pgentry_to_pagenr(_x) (l3_pgentry_val(_x) >> PAGE_SHIFT) 64.76 -#define l4_pgentry_to_pagenr(_x) (l4_pgentry_val(_x) >> PAGE_SHIFT) 64.77 - 64.78 -/* Turn a typed table entry into a physical address. */ 64.79 -#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & PAGE_MASK) 64.80 -#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & PAGE_MASK) 64.81 -#define l3_pgentry_to_phys(_x) (l3_pgentry_val(_x) & PAGE_MASK) 64.82 -#define l4_pgentry_to_phys(_x) (l4_pgentry_val(_x) & PAGE_MASK) 64.83 - 64.84 -/* Pagetable walking. 
*/ 64.85 -#define l2_pgentry_to_l1(_x) \ 64.86 - ((l1_pgentry_t *)__va(l2_pgentry_val(_x) & PAGE_MASK)) 64.87 -#define l3_pgentry_to_l2(_x) \ 64.88 - ((l2_pgentry_t *)__va(l3_pgentry_val(_x) & PAGE_MASK)) 64.89 -#define l4_pgentry_to_l3(_x) \ 64.90 - ((l3_pgentry_t *)__va(l4_pgentry_val(_x) & PAGE_MASK)) 64.91 - 64.92 -/* Given a virtual address, get an entry offset into a page table. */ 64.93 -#define l1_table_offset(_a) \ 64.94 - (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1)) 64.95 -#if defined(__i386__) 64.96 -#define l2_table_offset(_a) \ 64.97 - ((_a) >> L2_PAGETABLE_SHIFT) 64.98 -#elif defined(__x86_64__) 64.99 -#define l2_table_offset(_a) \ 64.100 - (((_a) >> L2_PAGETABLE_SHIFT) & (ENTRIES_PER_L2_PAGETABLE - 1)) 64.101 -#define l3_table_offset(_a) \ 64.102 - (((_a) >> L3_PAGETABLE_SHIFT) & (ENTRIES_PER_L3_PAGETABLE - 1)) 64.103 -#define l4_table_offset(_a) \ 64.104 - (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1)) 64.105 -#endif 64.106 - 64.107 -#if defined(__i386__) 64.108 -#define pagetable_t l2_pgentry_t 64.109 -#define pagetable_val(_x) ((_x).l2_lo) 64.110 -#define mk_pagetable(_x) ( (l2_pgentry_t) { (_x) } ) 64.111 -#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L2_PAGETABLE 64.112 -#elif defined(__x86_64__) 64.113 -#define pagetable_t l4_pgentry_t 64.114 -#define pagetable_val(_x) ((_x).l4_lo) 64.115 -#define mk_pagetable(_x) ( (l4_pgentry_t) { (_x) } ) 64.116 -#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L4_PAGETABLE 64.117 -#endif 64.118 - 64.119 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) 64.120 #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) 64.121 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) 64.122 -#define page_address(_p) (__va(((_p) - frame_table) << PAGE_SHIFT)) 64.123 #define pfn_to_page(_pfn) (frame_table + (_pfn)) 64.124 #define phys_to_page(kaddr) (frame_table + ((kaddr) >> PAGE_SHIFT)) 64.125 #define virt_to_page(kaddr) (frame_table + (__pa(kaddr) >> PAGE_SHIFT))
65.1 --- a/xen/include/asm-x86/shadow.h Mon Feb 07 08:19:24 2005 +0000 65.2 +++ b/xen/include/asm-x86/shadow.h Tue Feb 08 16:44:16 2005 +0000 65.3 @@ -8,6 +8,10 @@ 65.4 #include <xen/perfc.h> 65.5 #include <asm/processor.h> 65.6 65.7 +#ifdef CONFIG_VMX 65.8 +#include <asm/domain_page.h> 65.9 +#endif 65.10 + 65.11 /* Shadow PT flag bits in pfn_info */ 65.12 #define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */ 65.13 #define PSH_pfn_mask ((1<<21)-1) 65.14 @@ -34,7 +38,7 @@ extern int shadow_fault(unsigned long va 65.15 extern void shadow_l1_normal_pt_update( 65.16 unsigned long pa, unsigned long gpte, 65.17 unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr); 65.18 -extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte); 65.19 +extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde); 65.20 extern void unshadow_table(unsigned long gpfn, unsigned int type); 65.21 extern int shadow_mode_enable(struct domain *p, unsigned int mode); 65.22 65.23 @@ -43,17 +47,15 @@ extern void vmx_shadow_clear_state(struc 65.24 extern void vmx_shadow_invlpg(struct domain *, unsigned long); 65.25 #endif 65.26 65.27 -#define __get_machine_to_phys(_d, guest_gpfn, gpfn) \ 65.28 - if ((_d)->arch.shadow_mode == SHM_full_32) \ 65.29 - (guest_gpfn) = machine_to_phys_mapping[(gpfn)]; \ 65.30 - else \ 65.31 - (guest_gpfn) = (gpfn); 65.32 +#define __mfn_to_gpfn(_d, mfn) \ 65.33 + ( (shadow_mode(_d) == SHM_full_32) \ 65.34 + ? machine_to_phys_mapping[(mfn)] \ 65.35 + : (mfn) ) 65.36 65.37 -#define __get_phys_to_machine(_d, host_gpfn, gpfn) \ 65.38 - if ((_d)->arch.shadow_mode == SHM_full_32) \ 65.39 - (host_gpfn) = phys_to_machine_mapping[(gpfn)]; \ 65.40 - else \ 65.41 - (host_gpfn) = (gpfn); 65.42 +#define __gpfn_to_mfn(_d, gpfn) \ 65.43 + ( (shadow_mode(_d) == SHM_full_32) \ 65.44 + ? phys_to_machine_mapping(gpfn) \ 65.45 + : (gpfn) ) 65.46 65.47 extern void __shadow_mode_disable(struct domain *d); 65.48 static inline void shadow_mode_disable(struct domain *d) 65.49 @@ -66,17 +68,18 @@ extern unsigned long shadow_l2_table( 65.50 struct domain *d, unsigned long gpfn); 65.51 65.52 static inline void shadow_invalidate(struct exec_domain *ed) { 65.53 - if ( ed->domain->arch.shadow_mode != SHM_full_32 ) 65.54 + if ( shadow_mode(ed->domain) != SHM_full_32 ) 65.55 BUG(); 65.56 memset(ed->arch.shadow_vtable, 0, PAGE_SIZE); 65.57 } 65.58 65.59 #define SHADOW_DEBUG 1 65.60 +#define SHADOW_VERBOSE_DEBUG 0 65.61 #define SHADOW_HASH_DEBUG 1 65.62 65.63 struct shadow_status { 65.64 unsigned long pfn; /* Guest pfn. */ 65.65 - unsigned long spfn_and_flags; /* Shadow pfn plus flags. */ 65.66 + unsigned long smfn_and_flags; /* Shadow mfn plus flags. */ 65.67 struct shadow_status *next; /* Pull-to-front list. */ 65.68 }; 65.69 65.70 @@ -84,62 +87,72 @@ struct shadow_status { 65.71 #define shadow_ht_buckets 256 65.72 65.73 #ifdef VERBOSE 65.74 -#define SH_LOG(_f, _a...) \ 65.75 -printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \ 65.76 - current->domain->id , __LINE__ , ## _a ) 65.77 +#define SH_LOG(_f, _a...) \ 65.78 +printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n", \ 65.79 + current->domain->id , current->processor, __LINE__ , ## _a ) 65.80 #else 65.81 #define SH_LOG(_f, _a...) 65.82 #endif 65.83 65.84 #if SHADOW_DEBUG 65.85 -#define SH_VLOG(_f, _a...) \ 65.86 - printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \ 65.87 - current->domain->id , __LINE__ , ## _a ) 65.88 +#define SH_VLOG(_f, _a...) 
\ 65.89 + printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n", \ 65.90 + current->domain->id, current->processor, __LINE__ , ## _a ) 65.91 #else 65.92 #define SH_VLOG(_f, _a...) 65.93 #endif 65.94 65.95 -#if 0 65.96 -#define SH_VVLOG(_f, _a...) \ 65.97 - printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \ 65.98 - current->domain->id , __LINE__ , ## _a ) 65.99 +#if SHADOW_VERBOSE_DEBUG 65.100 +#define SH_VVLOG(_f, _a...) \ 65.101 + printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n", \ 65.102 + current->domain->id, current->processor, __LINE__ , ## _a ) 65.103 #else 65.104 #define SH_VVLOG(_f, _a...) 65.105 #endif 65.106 65.107 -static inline void __shadow_get_pl2e( 65.108 +// BUG: mafetter: this assumes ed == current, so why pass ed? 65.109 +static inline void __shadow_get_l2e( 65.110 struct exec_domain *ed, unsigned long va, unsigned long *sl2e) 65.111 { 65.112 - *sl2e = (ed->domain->arch.shadow_mode == SHM_full_32) ? 65.113 - l2_pgentry_val(ed->arch.shadow_vtable[l2_table_offset(va)]) : 65.114 - l2_pgentry_val(linear_l2_table[l2_table_offset(va)]); 65.115 + if ( shadow_mode(ed->domain) == SHM_full_32 ) { 65.116 + *sl2e = l2_pgentry_val(ed->arch.shadow_vtable[l2_table_offset(va)]); 65.117 + } 65.118 + else if ( shadow_mode(ed->domain) ) { 65.119 + *sl2e = l2_pgentry_val(shadow_linear_l2_table[l2_table_offset(va)]); 65.120 + } 65.121 + else 65.122 + *sl2e = l2_pgentry_val(linear_l2_table[l2_table_offset(va)]); 65.123 } 65.124 65.125 -static inline void __shadow_set_pl2e( 65.126 +static inline void __shadow_set_l2e( 65.127 struct exec_domain *ed, unsigned long va, unsigned long value) 65.128 { 65.129 - if ( ed->domain->arch.shadow_mode == SHM_full_32 ) 65.130 + if ( shadow_mode(ed->domain) == SHM_full_32 ) { 65.131 ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value); 65.132 + } 65.133 + else if ( shadow_mode(ed->domain) ) { 65.134 + shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value); 65.135 + } 65.136 else 65.137 linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value); 65.138 } 65.139 65.140 -static inline void __guest_get_pl2e( 65.141 +static inline void __guest_get_l2e( 65.142 struct exec_domain *ed, unsigned long va, unsigned long *l2e) 65.143 { 65.144 - *l2e = (ed->domain->arch.shadow_mode == SHM_full_32) ? 65.145 + *l2e = ( shadow_mode(ed->domain) == SHM_full_32) ? 
65.146 l2_pgentry_val(ed->arch.vpagetable[l2_table_offset(va)]) : 65.147 l2_pgentry_val(linear_l2_table[l2_table_offset(va)]); 65.148 } 65.149 65.150 -static inline void __guest_set_pl2e( 65.151 +static inline void __guest_set_l2e( 65.152 struct exec_domain *ed, unsigned long va, unsigned long value) 65.153 { 65.154 - if ( ed->domain->arch.shadow_mode == SHM_full_32 ) 65.155 + if ( shadow_mode(ed->domain) == SHM_full_32 ) 65.156 { 65.157 unsigned long pfn; 65.158 65.159 - pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; 65.160 + pfn = phys_to_machine_mapping(value >> PAGE_SHIFT); 65.161 ed->arch.guest_pl2e_cache[l2_table_offset(va)] = 65.162 mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 65.163 65.164 @@ -213,32 +226,18 @@ static inline void l1pte_write_fault( 65.165 { 65.166 unsigned long gpte = *gpte_p; 65.167 unsigned long spte = *spte_p; 65.168 + unsigned long pfn = gpte >> PAGE_SHIFT; 65.169 + unsigned long mfn = __gpfn_to_mfn(d, pfn); 65.170 65.171 ASSERT(gpte & _PAGE_RW); 65.172 gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; 65.173 65.174 - switch ( d->arch.shadow_mode ) 65.175 - { 65.176 - case SHM_test: 65.177 - spte = gpte | _PAGE_RW; 65.178 - break; 65.179 - 65.180 - case SHM_logdirty: 65.181 - spte = gpte | _PAGE_RW; 65.182 - __mark_dirty(d, gpte >> PAGE_SHIFT); 65.183 + if ( shadow_mode(d) == SHM_logdirty ) 65.184 + __mark_dirty(d, pfn); 65.185 65.186 - case SHM_full_32: 65.187 - { 65.188 - unsigned long host_pfn, host_gpte; 65.189 - 65.190 - host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; 65.191 - host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 65.192 - spte = host_gpte | _PAGE_RW; 65.193 - } 65.194 - break; 65.195 - } 65.196 + spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 65.197 65.198 - SH_VVLOG("updating spte=%lx gpte=%lx", spte, gpte); 65.199 + SH_VVLOG("l1pte_write_fault: updating spte=0x%08lx gpte=0x%08lx", spte, gpte); 65.200 *gpte_p = gpte; 65.201 *spte_p = spte; 65.202 } 65.203 @@ -248,31 +247,16 @@ static inline void l1pte_read_fault( 65.204 { 65.205 unsigned long gpte = *gpte_p; 65.206 unsigned long spte = *spte_p; 65.207 + unsigned long pfn = gpte >> PAGE_SHIFT; 65.208 + unsigned long mfn = __gpfn_to_mfn(d, pfn); 65.209 65.210 gpte |= _PAGE_ACCESSED; 65.211 - 65.212 - switch ( d->arch.shadow_mode ) 65.213 - { 65.214 - case SHM_test: 65.215 - spte = (gpte & _PAGE_DIRTY) ? gpte : (gpte & ~_PAGE_RW); 65.216 - break; 65.217 - 65.218 - case SHM_logdirty: 65.219 - spte = gpte & ~_PAGE_RW; 65.220 - break; 65.221 + spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 65.222 65.223 - case SHM_full_32: 65.224 - { 65.225 - unsigned long host_pfn, host_gpte; 65.226 - 65.227 - host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; 65.228 - host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 65.229 - spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW); 65.230 - } 65.231 - break; 65.232 + if ( (shadow_mode(d) == SHM_logdirty) || ! 
(gpte & _PAGE_DIRTY) ) 65.233 + spte &= ~_PAGE_RW; 65.234 65.235 - } 65.236 - 65.237 + SH_VVLOG("l1pte_read_fault: updating spte=0x%08lx gpte=0x%08lx", spte, gpte); 65.238 *gpte_p = gpte; 65.239 *spte_p = spte; 65.240 } 65.241 @@ -283,8 +267,11 @@ static inline void l1pte_propagate_from_ 65.242 unsigned long gpte = *gpte_p; 65.243 unsigned long spte = *spte_p; 65.244 unsigned long host_pfn, host_gpte; 65.245 +#if SHADOW_VERBOSE_DEBUG 65.246 + unsigned long old_spte = spte; 65.247 +#endif 65.248 65.249 - switch ( d->arch.shadow_mode ) 65.250 + switch ( shadow_mode(d) ) 65.251 { 65.252 case SHM_test: 65.253 spte = 0; 65.254 @@ -309,7 +296,7 @@ static inline void l1pte_propagate_from_ 65.255 return; 65.256 } 65.257 65.258 - host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; 65.259 + host_pfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT); 65.260 host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); 65.261 65.262 if ( (host_gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 65.263 @@ -320,6 +307,11 @@ static inline void l1pte_propagate_from_ 65.264 break; 65.265 } 65.266 65.267 +#if SHADOW_VERBOSE_DEBUG 65.268 + if ( old_spte || spte || gpte ) 65.269 + SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%08lx, old spte=0x%08lx, new spte=0x%08lx ", gpte, old_spte, spte); 65.270 +#endif 65.271 + 65.272 *gpte_p = gpte; 65.273 *spte_p = spte; 65.274 } 65.275 @@ -328,24 +320,24 @@ static inline void l2pde_general( 65.276 struct domain *d, 65.277 unsigned long *gpde_p, 65.278 unsigned long *spde_p, 65.279 - unsigned long sl1pfn) 65.280 + unsigned long sl1mfn) 65.281 { 65.282 unsigned long gpde = *gpde_p; 65.283 unsigned long spde = *spde_p; 65.284 65.285 spde = 0; 65.286 65.287 - if ( sl1pfn != 0 ) 65.288 + if ( sl1mfn != 0 ) 65.289 { 65.290 - spde = (gpde & ~PAGE_MASK) | (sl1pfn << PAGE_SHIFT) | 65.291 + spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | 65.292 _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY; 65.293 gpde |= _PAGE_ACCESSED | _PAGE_DIRTY; 65.294 65.295 /* Detect linear p.t. mappings and write-protect them. */ 65.296 - if ( (frame_table[sl1pfn].u.inuse.type_info & PGT_type_mask) == 65.297 + if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) == 65.298 PGT_l2_page_table ) 65.299 { 65.300 - if ( d->arch.shadow_mode != SHM_full_32 ) 65.301 + if ( shadow_mode(d) != SHM_full_32 ) 65.302 spde = gpde & ~_PAGE_RW; 65.303 65.304 } 65.305 @@ -366,20 +358,20 @@ static void shadow_audit(struct domain * 65.306 for ( j = 0; j < shadow_ht_buckets; j++ ) 65.307 { 65.308 a = &d->arch.shadow_ht[j]; 65.309 - if ( a->pfn ) { live++; ASSERT(a->spfn_and_flags & PSH_pfn_mask); } 65.310 + if ( a->pfn ) { live++; ASSERT(a->smfn_and_flags & PSH_pfn_mask); } 65.311 ASSERT(a->pfn < 0x00100000UL); 65.312 a = a->next; 65.313 while ( a && (live < 9999) ) 65.314 { 65.315 live++; 65.316 - if ( (a->pfn == 0) || (a->spfn_and_flags == 0) ) 65.317 + if ( (a->pfn == 0) || (a->smfn_and_flags == 0) ) 65.318 { 65.319 printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n", 65.320 - live, a->pfn, a->spfn_and_flags, a->next); 65.321 + live, a->pfn, a->smfn_and_flags, a->next); 65.322 BUG(); 65.323 } 65.324 ASSERT(a->pfn < 0x00100000UL); 65.325 - ASSERT(a->spfn_and_flags & PSH_pfn_mask); 65.326 + ASSERT(a->smfn_and_flags & PSH_pfn_mask); 65.327 a = a->next; 65.328 } 65.329 ASSERT(live < 9999); 65.330 @@ -411,6 +403,12 @@ static inline struct shadow_status *hash 65.331 } 65.332 65.333 65.334 +/* 65.335 + * N.B. This takes a guest pfn (i.e. 
a pfn in the guest's namespace, 65.336 + * which, depending on full shadow mode, may or may not equal 65.337 + * its mfn). 65.338 + * The shadow status it returns is a mfn. 65.339 + */ 65.340 static inline unsigned long __shadow_status( 65.341 struct domain *d, unsigned int gpfn) 65.342 { 65.343 @@ -419,7 +417,7 @@ static inline unsigned long __shadow_sta 65.344 x = head = hash_bucket(d, gpfn); 65.345 p = NULL; 65.346 65.347 - SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x); 65.348 + //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x); 65.349 shadow_audit(d, 0); 65.350 65.351 do 65.352 @@ -438,10 +436,12 @@ static inline unsigned long __shadow_sta 65.353 65.354 /* Swap 'x' contents with head contents. */ 65.355 SWAP(head->pfn, x->pfn); 65.356 - SWAP(head->spfn_and_flags, x->spfn_and_flags); 65.357 + SWAP(head->smfn_and_flags, x->smfn_and_flags); 65.358 } 65.359 65.360 - return head->spfn_and_flags; 65.361 + SH_VVLOG("lookup gpfn=%08lx => status=%08lx", 65.362 + gpfn, head->smfn_and_flags); 65.363 + return head->smfn_and_flags; 65.364 } 65.365 65.366 p = x; 65.367 @@ -449,6 +449,7 @@ static inline unsigned long __shadow_sta 65.368 } 65.369 while ( x != NULL ); 65.370 65.371 + SH_VVLOG("lookup gpfn=%08lx => status=0", gpfn); 65.372 return 0; 65.373 } 65.374 65.375 @@ -462,7 +463,7 @@ static inline unsigned long get_shadow_s 65.376 { 65.377 unsigned long res; 65.378 65.379 - ASSERT(d->arch.shadow_mode); 65.380 + ASSERT(shadow_mode(d)); 65.381 65.382 /* 65.383 * If we get here we know that some sort of update has happened to the 65.384 @@ -474,7 +475,7 @@ static inline unsigned long get_shadow_s 65.385 65.386 shadow_lock(d); 65.387 65.388 - if ( d->arch.shadow_mode == SHM_logdirty ) 65.389 + if ( shadow_mode(d) == SHM_logdirty ) 65.390 __mark_dirty(d, gpfn); 65.391 65.392 if ( !(res = __shadow_status(d, gpfn)) ) 65.393 @@ -511,14 +512,14 @@ static inline void delete_shadow_status( 65.394 { 65.395 /* Overwrite head with contents of following node. */ 65.396 head->pfn = n->pfn; 65.397 - head->spfn_and_flags = n->spfn_and_flags; 65.398 + head->smfn_and_flags = n->smfn_and_flags; 65.399 65.400 /* Delete following node. */ 65.401 head->next = n->next; 65.402 65.403 /* Add deleted node to the free list. */ 65.404 n->pfn = 0; 65.405 - n->spfn_and_flags = 0; 65.406 + n->smfn_and_flags = 0; 65.407 n->next = d->arch.shadow_ht_free; 65.408 d->arch.shadow_ht_free = n; 65.409 } 65.410 @@ -526,7 +527,7 @@ static inline void delete_shadow_status( 65.411 { 65.412 /* This bucket is now empty. Initialise the head node. */ 65.413 head->pfn = 0; 65.414 - head->spfn_and_flags = 0; 65.415 + head->smfn_and_flags = 0; 65.416 } 65.417 65.418 goto found; 65.419 @@ -544,7 +545,7 @@ static inline void delete_shadow_status( 65.420 65.421 /* Add deleted node to the free list. 
*/ 65.422 x->pfn = 0; 65.423 - x->spfn_and_flags = 0; 65.424 + x->smfn_and_flags = 0; 65.425 x->next = d->arch.shadow_ht_free; 65.426 d->arch.shadow_ht_free = x; 65.427 65.428 @@ -587,7 +588,7 @@ static inline void set_shadow_status( 65.429 { 65.430 if ( x->pfn == gpfn ) 65.431 { 65.432 - x->spfn_and_flags = s; 65.433 + x->smfn_and_flags = s; 65.434 goto done; 65.435 } 65.436 65.437 @@ -603,7 +604,7 @@ static inline void set_shadow_status( 65.438 if ( head->pfn == 0 ) 65.439 { 65.440 head->pfn = gpfn; 65.441 - head->spfn_and_flags = s; 65.442 + head->smfn_and_flags = s; 65.443 ASSERT(head->next == NULL); 65.444 goto done; 65.445 } 65.446 @@ -643,7 +644,7 @@ static inline void set_shadow_status( 65.447 65.448 /* Initialise the new node and insert directly after the head item. */ 65.449 x->pfn = gpfn; 65.450 - x->spfn_and_flags = s; 65.451 + x->smfn_and_flags = s; 65.452 x->next = head->next; 65.453 head->next = x; 65.454 65.455 @@ -652,10 +653,9 @@ static inline void set_shadow_status( 65.456 } 65.457 65.458 #ifdef CONFIG_VMX 65.459 -#include <asm/domain_page.h> 65.460 65.461 static inline void vmx_update_shadow_state( 65.462 - struct exec_domain *ed, unsigned long gpfn, unsigned long spfn) 65.463 + struct exec_domain *ed, unsigned long gpfn, unsigned long smfn) 65.464 { 65.465 65.466 l2_pgentry_t *mpl2e = 0; 65.467 @@ -672,70 +672,46 @@ static inline void vmx_update_shadow_sta 65.468 map_domain_mem(pagetable_val(ed->arch.monitor_table)); 65.469 65.470 mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = 65.471 - mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 65.472 + mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); 65.473 __flush_tlb_one(SH_LINEAR_PT_VIRT_START); 65.474 65.475 - spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); 65.476 + spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT); 65.477 gpl2e = (l2_pgentry_t *)map_domain_mem(gpfn << PAGE_SHIFT); 65.478 memset(spl2e, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 65.479 65.480 - ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT); 65.481 ed->arch.shadow_vtable = spl2e; 65.482 ed->arch.vpagetable = gpl2e; /* expect the guest did clean this up */ 65.483 unmap_domain_mem(mpl2e); 65.484 } 65.485 65.486 +#endif /* CONFIG_VMX */ 65.487 + 65.488 static inline void __shadow_mk_pagetable(struct exec_domain *ed) 65.489 { 65.490 struct domain *d = ed->domain; 65.491 unsigned long gpfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 65.492 - unsigned long spfn; 65.493 - SH_VLOG("0: __shadow_mk_pagetable(gpfn=%08lx\n", gpfn); 65.494 + unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask; 65.495 65.496 - if (d->arch.shadow_mode == SHM_full_32) 65.497 - { 65.498 - unsigned long guest_gpfn; 65.499 - guest_gpfn = machine_to_phys_mapping[gpfn]; 65.500 - 65.501 - SH_VVLOG("__shadow_mk_pagetable(guest_gpfn=%08lx, gpfn=%08lx\n", 65.502 - guest_gpfn, gpfn); 65.503 + SH_VVLOG("0: __shadow_mk_pagetable(gpfn=%08lx, smfn=%08lx)", gpfn, smfn); 65.504 65.505 - spfn = __shadow_status(d, guest_gpfn) & PSH_pfn_mask; 65.506 - if ( unlikely(spfn == 0) ) { 65.507 - spfn = shadow_l2_table(d, gpfn); 65.508 - ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT); 65.509 - } else { 65.510 - vmx_update_shadow_state(ed, gpfn, spfn); 65.511 - } 65.512 - } else { 65.513 - spfn = __shadow_status(d, gpfn) & PSH_pfn_mask; 65.514 + if ( unlikely(smfn == 0) ) 65.515 + smfn = shadow_l2_table(d, gpfn); 65.516 +#ifdef CONFIG_VMX 65.517 + else 65.518 + if (d->arch.shadow_mode == SHM_full_32) 65.519 + vmx_update_shadow_state(ed, 
gpfn, smfn); 65.520 +#endif 65.521 65.522 - if ( unlikely(spfn == 0) ) { 65.523 - spfn = shadow_l2_table(d, gpfn); 65.524 - } 65.525 - ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT); 65.526 - } 65.527 + ed->arch.shadow_table = mk_pagetable(smfn<<PAGE_SHIFT); 65.528 } 65.529 -#else 65.530 -static inline void __shadow_mk_pagetable(struct exec_domain *ed) 65.531 -{ 65.532 - unsigned long gpfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 65.533 - unsigned long spfn = __shadow_status(ed->domain, gpfn); 65.534 - 65.535 - if ( unlikely(spfn == 0) ) 65.536 - spfn = shadow_l2_table(ed->domain, gpfn); 65.537 - 65.538 - ed->arch.shadow_table = mk_pagetable(spfn << PAGE_SHIFT); 65.539 -} 65.540 -#endif /* CONFIG_VMX */ 65.541 65.542 static inline void shadow_mk_pagetable(struct exec_domain *ed) 65.543 { 65.544 - if ( unlikely(ed->domain->arch.shadow_mode) ) 65.545 + if ( unlikely(shadow_mode(ed->domain)) ) 65.546 { 65.547 SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", 65.548 pagetable_val(ed->arch.pagetable), 65.549 - ed->domain->arch.shadow_mode); 65.550 + shadow_mode(ed->domain)); 65.551 65.552 shadow_lock(ed->domain); 65.553 __shadow_mk_pagetable(ed); 65.554 @@ -744,13 +720,13 @@ static inline void shadow_mk_pagetable(s 65.555 SH_VVLOG("leaving shadow_mk_pagetable:\n" 65.556 "( gptbase=%08lx, mode=%d ) sh=%08lx", 65.557 pagetable_val(ed->arch.pagetable), 65.558 - ed->domain->arch.shadow_mode, 65.559 + shadow_mode(ed->domain), 65.560 pagetable_val(ed->arch.shadow_table) ); 65.561 } 65.562 } 65.563 65.564 #if SHADOW_DEBUG 65.565 -extern int check_pagetable(struct domain *d, pagetable_t pt, char *s); 65.566 +extern void check_pagetable(struct domain *d, pagetable_t pt, char *s); 65.567 #else 65.568 #define check_pagetable(d, pt, s) ((void)0) 65.569 #endif
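The shadow code above now consistently separates guest page-frame numbers (gpfn) from machine frame numbers (mfn): __gpfn_to_mfn()/__mfn_to_gpfn() translate only under SHM_full_32, the hash-table field is renamed to smfn_and_flags, and the fault helpers build the shadow pte from the translated mfn. A condensed sketch of the write-fault path, restating the l1pte_write_fault logic from the diff:

    /* Sketch: derive a shadow l1 entry from a guest pte on a write fault. */
    static inline unsigned long example_shadow_write_pte(
        struct domain *d, unsigned long gpte)
    {
        unsigned long pfn = gpte >> PAGE_SHIFT;     /* frame in guest namespace */
        unsigned long mfn = __gpfn_to_mfn(d, pfn);  /* real machine frame       */

        if ( shadow_mode(d) == SHM_logdirty )
            __mark_dirty(d, pfn);                   /* record write in dirty log */

        /* Shadow pte references the machine frame but keeps the guest's flags. */
        return (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
    }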
66.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 66.2 +++ b/xen/include/asm-x86/x86_32/page.h Tue Feb 08 16:44:16 2005 +0000 66.3 @@ -0,0 +1,56 @@ 66.4 +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 66.5 + 66.6 +#ifndef __X86_32_PAGE_H__ 66.7 +#define __X86_32_PAGE_H__ 66.8 + 66.9 +#define L1_PAGETABLE_SHIFT 12 66.10 +#define L2_PAGETABLE_SHIFT 22 66.11 +#define PAGE_SHIFT L1_PAGETABLE_SHIFT 66.12 + 66.13 +#define ENTRIES_PER_L1_PAGETABLE 1024 66.14 +#define ENTRIES_PER_L2_PAGETABLE 1024 66.15 + 66.16 +#define __PAGE_OFFSET (0xFC400000) 66.17 + 66.18 +#ifndef __ASSEMBLY__ 66.19 +#include <xen/config.h> 66.20 +typedef struct { unsigned long l1_lo; } l1_pgentry_t; 66.21 +typedef struct { unsigned long l2_lo; } l2_pgentry_t; 66.22 +#endif /* !__ASSEMBLY__ */ 66.23 + 66.24 +/* Strip type from a table entry. */ 66.25 +#define l1_pgentry_val(_x) ((_x).l1_lo) 66.26 +#define l2_pgentry_val(_x) ((_x).l2_lo) 66.27 + 66.28 +/* Add type to a table entry. */ 66.29 +#define mk_l1_pgentry(_x) ( (l1_pgentry_t) { (_x) } ) 66.30 +#define mk_l2_pgentry(_x) ( (l2_pgentry_t) { (_x) } ) 66.31 + 66.32 +/* Turn a typed table entry into a physical address. */ 66.33 +#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & PAGE_MASK) 66.34 +#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & PAGE_MASK) 66.35 + 66.36 +/* Turn a typed table entry into a page index. */ 66.37 +#define l1_pgentry_to_pfn(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 66.38 +#define l2_pgentry_to_pfn(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT) 66.39 + 66.40 +/* Pagetable walking. */ 66.41 +#define l2_pgentry_to_l1(_x) \ 66.42 + ((l1_pgentry_t *)__va(l2_pgentry_to_phys(_x))) 66.43 + 66.44 +/* Given a virtual address, get an entry offset into a page table. */ 66.45 +#define l1_table_offset(_a) \ 66.46 + (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1)) 66.47 +#define l2_table_offset(_a) \ 66.48 + ((_a) >> L2_PAGETABLE_SHIFT) 66.49 + 66.50 +/* Given a virtual address, get an entry offset into a linear page table. */ 66.51 +#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT) 66.52 + 66.53 +/* Root page-table definitions. */ 66.54 +#define pagetable_t l2_pgentry_t 66.55 +#define pagetable_val(_x) ((_x).l2_lo) 66.56 +#define mk_pagetable(_x) ( (l2_pgentry_t) { (_x) } ) 66.57 +#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L2_PAGETABLE 66.58 + 66.59 +#endif /* __X86_32_PAGE_H__ */
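With the 32-bit definitions split into their own header, a two-level table walk can be written purely in terms of these macros. A hypothetical helper, assuming the relevant entries are present and the tables are reachable through the direct map:

    /* Hypothetical: resolve a virtual address via a 2-level x86_32 page table. */
    static inline unsigned long example_virt_to_phys(
        pagetable_t pt, unsigned long va)
    {
        l2_pgentry_t *l2  = __va(pagetable_val(pt) & PAGE_MASK);
        l2_pgentry_t  pde = l2[l2_table_offset(va)];
        l1_pgentry_t *l1  = l2_pgentry_to_l1(pde);  /* __va() of the l1 table */
        l1_pgentry_t  pte = l1[l1_table_offset(va)];
        return l1_pgentry_to_phys(pte) | (va & ~PAGE_MASK);
    }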
67.1 --- a/xen/include/asm-x86/x86_32/regs.h Mon Feb 07 08:19:24 2005 +0000 67.2 +++ b/xen/include/asm-x86/x86_32/regs.h Tue Feb 08 16:44:16 2005 +0000 67.3 @@ -39,4 +39,6 @@ struct xen_regs 67.4 #define RING_2(_r) (((_r)->cs & 3) == 2) 67.5 #define RING_3(_r) (((_r)->cs & 3) == 3) 67.6 67.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r)) 67.8 + 67.9 #endif
68.1 --- a/xen/include/asm-x86/x86_32/uaccess.h Mon Feb 07 08:19:24 2005 +0000 68.2 +++ b/xen/include/asm-x86/x86_32/uaccess.h Tue Feb 08 16:44:16 2005 +0000 68.3 @@ -8,7 +8,6 @@ 68.4 #include <xen/errno.h> 68.5 #include <xen/prefetch.h> 68.6 #include <xen/string.h> 68.7 -#include <xen/sched.h> 68.8 68.9 #define __user 68.10
69.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 69.2 +++ b/xen/include/asm-x86/x86_64/page.h Tue Feb 08 16:44:16 2005 +0000 69.3 @@ -0,0 +1,84 @@ 69.4 +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 69.5 + 69.6 +#ifndef __X86_64_PAGE_H__ 69.7 +#define __X86_64_PAGE_H__ 69.8 + 69.9 +#define L1_PAGETABLE_SHIFT 12 69.10 +#define L2_PAGETABLE_SHIFT 21 69.11 +#define L3_PAGETABLE_SHIFT 30 69.12 +#define L4_PAGETABLE_SHIFT 39 69.13 +#define PAGE_SHIFT L1_PAGETABLE_SHIFT 69.14 + 69.15 +#define ENTRIES_PER_L1_PAGETABLE 512 69.16 +#define ENTRIES_PER_L2_PAGETABLE 512 69.17 +#define ENTRIES_PER_L3_PAGETABLE 512 69.18 +#define ENTRIES_PER_L4_PAGETABLE 512 69.19 + 69.20 +#define __PAGE_OFFSET (0xFFFF830000000000) 69.21 + 69.22 +/* These may increase in future (phys. bits in particular). */ 69.23 +#define PADDR_BITS 40 69.24 +#define VADDR_BITS 48 69.25 +#define PADDR_MASK ((1UL << PADDR_BITS)-1) 69.26 +#define VADDR_MASK ((1UL << VADDR_BITS)-1) 69.27 + 69.28 +#ifndef __ASSEMBLY__ 69.29 +#include <xen/config.h> 69.30 +typedef struct { unsigned long l1_lo; } l1_pgentry_t; 69.31 +typedef struct { unsigned long l2_lo; } l2_pgentry_t; 69.32 +typedef struct { unsigned long l3_lo; } l3_pgentry_t; 69.33 +typedef struct { unsigned long l4_lo; } l4_pgentry_t; 69.34 +#endif /* !__ASSEMBLY__ */ 69.35 + 69.36 +/* Strip type from a table entry. */ 69.37 +#define l1_pgentry_val(_x) ((_x).l1_lo) 69.38 +#define l2_pgentry_val(_x) ((_x).l2_lo) 69.39 +#define l3_pgentry_val(_x) ((_x).l3_lo) 69.40 +#define l4_pgentry_val(_x) ((_x).l4_lo) 69.41 + 69.42 +/* Add type to a table entry. */ 69.43 +#define mk_l1_pgentry(_x) ( (l1_pgentry_t) { (_x) } ) 69.44 +#define mk_l2_pgentry(_x) ( (l2_pgentry_t) { (_x) } ) 69.45 +#define mk_l3_pgentry(_x) ( (l3_pgentry_t) { (_x) } ) 69.46 +#define mk_l4_pgentry(_x) ( (l4_pgentry_t) { (_x) } ) 69.47 + 69.48 +/* Turn a typed table entry into a physical address. */ 69.49 +#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK)) 69.50 +#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK)) 69.51 +#define l3_pgentry_to_phys(_x) (l3_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK)) 69.52 +#define l4_pgentry_to_phys(_x) (l4_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK)) 69.53 + 69.54 +/* Turn a typed table entry into a page index. */ 69.55 +#define l1_pgentry_to_pfn(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 69.56 +#define l2_pgentry_to_pfn(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT) 69.57 +#define l3_pgentry_to_pfn(_x) (l3_pgentry_val(_x) >> PAGE_SHIFT) 69.58 +#define l4_pgentry_to_pfn(_x) (l4_pgentry_val(_x) >> PAGE_SHIFT) 69.59 + 69.60 +/* Pagetable walking. */ 69.61 +#define l2_pgentry_to_l1(_x) \ 69.62 + ((l1_pgentry_t *)__va(l2_pgentry_to_phys(_x))) 69.63 +#define l3_pgentry_to_l2(_x) \ 69.64 + ((l2_pgentry_t *)__va(l3_pgentry_to_phys(_x))) 69.65 +#define l4_pgentry_to_l3(_x) \ 69.66 + ((l3_pgentry_t *)__va(l4_pgentry_to_phys(_x))) 69.67 + 69.68 +/* Given a virtual address, get an entry offset into a page table. */ 69.69 +#define l1_table_offset(_a) \ 69.70 + (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1)) 69.71 +#define l2_table_offset(_a) \ 69.72 + (((_a) >> L2_PAGETABLE_SHIFT) & (ENTRIES_PER_L2_PAGETABLE - 1)) 69.73 +#define l3_table_offset(_a) \ 69.74 + (((_a) >> L3_PAGETABLE_SHIFT) & (ENTRIES_PER_L3_PAGETABLE - 1)) 69.75 +#define l4_table_offset(_a) \ 69.76 + (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1)) 69.77 + 69.78 +/* Given a virtual address, get an entry offset into a linear page table. 
*/ 69.79 +#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> PAGE_SHIFT) 69.80 + 69.81 +/* Root page-table definitions. */ 69.82 +#define pagetable_t l4_pgentry_t 69.83 +#define pagetable_val(_x) ((_x).l4_lo) 69.84 +#define mk_pagetable(_x) ( (l4_pgentry_t) { (_x) } ) 69.85 +#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L4_PAGETABLE 69.86 + 69.87 +#endif /* __X86_64_PAGE_H__ */
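On x86_64 the *_pgentry_to_phys() accessors mask with PADDR_MASK as well as PAGE_MASK, because bits above PADDR_BITS in an entry (notably bit 63, NX) are flag bits rather than address bits, and l1_linear_offset() first strips the sign-extended upper half with VADDR_MASK. Worked examples with hypothetical values:

    /* Hypothetical pte: frame at 0x12345000, NX (bit 63) plus Present|RW set. */
    l1_pgentry_t pte = mk_l1_pgentry((1UL << 63) | 0x12345000UL | 0x3);

    /* PADDR_MASK & PAGE_MASK strips both the low flag bits and bit 63:         */
    /*   l1_pgentry_to_phys(pte) == 0x12345000                                  */

    /* VADDR_MASK removes the sign extension of a canonical hypervisor address: */
    /*   l1_linear_offset(0xFFFF830000001000UL) == 0x830000001                  */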
70.1 --- a/xen/include/asm-x86/x86_64/regs.h Mon Feb 07 08:19:24 2005 +0000 70.2 +++ b/xen/include/asm-x86/x86_64/regs.h Tue Feb 08 16:44:16 2005 +0000 70.3 @@ -36,4 +36,6 @@ struct xen_regs 70.4 #define RING_2(_r) (((_r)->cs & 3) == 2) 70.5 #define RING_3(_r) (((_r)->cs & 3) == 3) 70.6 70.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r)) 70.8 + 70.9 #endif
71.1 --- a/xen/include/asm-x86/x86_64/uaccess.h Mon Feb 07 08:19:24 2005 +0000 71.2 +++ b/xen/include/asm-x86/x86_64/uaccess.h Tue Feb 08 16:44:16 2005 +0000 71.3 @@ -7,7 +7,6 @@ 71.4 #include <xen/config.h> 71.5 #include <xen/compiler.h> 71.6 #include <xen/errno.h> 71.7 -#include <xen/sched.h> 71.8 #include <xen/prefetch.h> 71.9 #include <asm/page.h> 71.10 71.11 @@ -16,34 +15,19 @@ 71.12 #define VERIFY_READ 0 71.13 #define VERIFY_WRITE 1 71.14 71.15 -#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START) 71.16 - 71.17 /* 71.18 - * Test whether a block of memory is a valid user space address. 71.19 - * Returns 0 if the range is valid, nonzero otherwise. 71.20 - * 71.21 - * This is equivalent to the following test: 71.22 - * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ? 71.23 - * (((u65)addr + (u65)size) >= ((u65)1 << 64)) : 71.24 - * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START)) 71.25 + * Valid if in +ve half of 48-bit address space, or above Xen-reserved area. 71.26 + * This is also valid for range checks (addr, addr+size). As long as the 71.27 + * start address is outside the Xen-reserved area then we will access a 71.28 + * non-canonical address (and thus fault) before ever reaching VIRT_START. 71.29 */ 71.30 -#define __range_not_ok(addr,size) ({ \ 71.31 - unsigned long flag,sum; \ 71.32 - if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \ 71.33 - asm("addq %3,%1 ; sbbq %0,%0" \ 71.34 - :"=&r" (flag), "=r" (sum) \ 71.35 - :"1" (addr),"g" ((long)(size))); \ 71.36 - else \ 71.37 - asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0" \ 71.38 - :"=&r" (flag), "=r" (sum) \ 71.39 - :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \ 71.40 - flag; }) 71.41 +#define __addr_ok(addr) \ 71.42 + (((unsigned long)(addr) < (1UL<<48)) || \ 71.43 + ((unsigned long)(addr) >= HYPERVISOR_VIRT_END)) 71.44 71.45 -#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0) 71.46 +#define access_ok(type, addr, size) (__addr_ok(addr)) 71.47 71.48 -#define array_access_ok(type,addr,count,size) \ 71.49 - (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ && \ 71.50 - access_ok(type,addr,(unsigned long)count*(unsigned long)size)) 71.51 +#define array_access_ok(type,addr,count,size) (__addr_ok(addr)) 71.52 71.53 extern long __get_user_bad(void); 71.54 extern void __put_user_bad(void);
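The simplified __addr_ok() accepts any address either below 2^48 (the whole positive canonical half plus the non-canonical hole) or above HYPERVISOR_VIRT_END; as the new comment explains, a copy that starts from such an address and grows upwards will fault on a non-canonical address before it can ever reach the Xen-reserved range, so no explicit (addr, addr+size) check is needed. Two illustrative cases (addresses hypothetical):

    /* User-half canonical address: accepted.                               */
    access_ok(VERIFY_READ, 0x00007f0000000000UL, PAGE_SIZE);   /* -> true  */

    /* Address inside the Xen-reserved area itself: rejected.               */
    access_ok(VERIFY_WRITE, HYPERVISOR_VIRT_START, 8);          /* -> false */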
72.1 --- a/xen/include/public/arch-x86_64.h Mon Feb 07 08:19:24 2005 +0000 72.2 +++ b/xen/include/public/arch-x86_64.h Tue Feb 08 16:44:16 2005 +0000 72.3 @@ -43,11 +43,11 @@ 72.4 */ 72.5 72.6 #define FLAT_RING3_CS32 0x0823 /* GDT index 260 */ 72.7 -#define FLAT_RING3_CS64 0x082b /* GDT index 261 */ 72.8 -#define FLAT_RING3_DS32 0x0833 /* GDT index 262 */ 72.9 +#define FLAT_RING3_CS64 0x0833 /* GDT index 261 */ 72.10 +#define FLAT_RING3_DS32 0x082b /* GDT index 262 */ 72.11 #define FLAT_RING3_DS64 0x0000 /* NULL selector */ 72.12 -#define FLAT_RING3_SS32 0x0833 /* GDT index 262 */ 72.13 -#define FLAT_RING3_SS64 0x0833 /* GDT index 262 */ 72.14 +#define FLAT_RING3_SS32 0x082b /* GDT index 262 */ 72.15 +#define FLAT_RING3_SS64 0x082b /* GDT index 262 */ 72.16 72.17 #define FLAT_GUESTOS_DS64 FLAT_RING3_DS64 72.18 #define FLAT_GUESTOS_DS32 FLAT_RING3_DS32
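For reference, an x86 segment selector is (GDT index << 3) | table-indicator | RPL, so the swapped flat selectors decode as follows (standard x86 encoding; note the hard-coded "GDT index" comments in the header still carry the pre-swap numbering):

    /* selector = (index << 3) | TI (0 = GDT) | RPL                      */
    /* 0x0823 = (260 << 3) | 3  ->  GDT entry 260, privilege level 3     */
    /* 0x082b = (261 << 3) | 3  ->  GDT entry 261, privilege level 3     */
    /* 0x0833 = (262 << 3) | 3  ->  GDT entry 262, privilege level 3     */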
73.1 --- a/xen/include/public/xen.h Mon Feb 07 08:19:24 2005 +0000 73.2 +++ b/xen/include/public/xen.h Tue Feb 08 16:44:16 2005 +0000 73.3 @@ -23,7 +23,14 @@ 73.4 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). 73.5 */ 73.6 73.7 -/* EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. */ 73.8 +/* 73.9 + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. 73.10 + * EAX = return value 73.11 + * (argument registers may be clobbered on return) 73.12 + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 73.13 + * RAX = return value 73.14 + * (argument registers not clobbered on return; RCX, R11 are) 73.15 + */ 73.16 #define __HYPERVISOR_set_trap_table 0 73.17 #define __HYPERVISOR_mmu_update 1 73.18 #define __HYPERVISOR_set_gdt 2
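A minimal guest-side illustration of the x86_32 calling convention documented above, written as a hypothetical two-argument wrapper (real guests use their generated hypercall stubs; int $0x82 is the Xen hypercall trap vector of this era):

    /* Hypothetical x86_32 hypercall wrapper: EAX = vector, EBX/ECX = args 1,2. */
    static inline long example_hypercall2(unsigned int op,
                                          unsigned long a1, unsigned long a2)
    {
        long ret;
        unsigned long ign1 = a1, ign2 = a2;   /* arg registers may be clobbered */
        __asm__ __volatile__ (
            "int $0x82"
            : "=a" (ret), "+b" (ign1), "+c" (ign2)
            : "0" (op)
            : "memory" );
        return ret;
    }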
74.1 --- a/xen/include/xen/ioport.h Mon Feb 07 08:19:24 2005 +0000 74.2 +++ b/xen/include/xen/ioport.h Tue Feb 08 16:44:16 2005 +0000 74.3 @@ -100,17 +100,13 @@ extern int allocate_resource(struct reso 74.4 #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name)) 74.5 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name)) 74.6 74.7 -extern struct resource * __request_region(struct resource *, unsigned long start, unsigned long n, const char *name); 74.8 - 74.9 -/* Compatibility cruft */ 74.10 -#define check_region(start,n) __check_region(&ioport_resource, (start), (n)) 74.11 #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) 74.12 -#define check_mem_region(start,n) __check_region(&iomem_resource, (start), (n)) 74.13 #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) 74.14 74.15 -extern int __check_region(struct resource *, unsigned long, unsigned long); 74.16 extern void __release_region(struct resource *, unsigned long, unsigned long); 74.17 74.18 +extern struct resource * __request_region(struct resource *, unsigned long start, unsigned long n, const char *name); 74.19 + 74.20 #define get_ioport_list(buf) get_resource_list(&ioport_resource, buf, PAGE_SIZE) 74.21 #define get_mem_list(buf) get_resource_list(&iomem_resource, buf, PAGE_SIZE) 74.22
75.1 --- a/xen/include/xen/sched.h Mon Feb 07 08:19:24 2005 +0000 75.2 +++ b/xen/include/xen/sched.h Tue Feb 08 16:44:16 2005 +0000 75.3 @@ -262,8 +262,32 @@ int idle_cpu(int cpu); /* Is CPU 'cpu' i 75.4 75.5 void startup_cpu_idle_loop(void); 75.6 75.7 -unsigned long hypercall_create_continuation( 75.8 +unsigned long __hypercall_create_continuation( 75.9 unsigned int op, unsigned int nr_args, ...); 75.10 +#define hypercall0_create_continuation(_op) \ 75.11 + __hypercall_create_continuation((_op), 0) 75.12 +#define hypercall1_create_continuation(_op, _a1) \ 75.13 + __hypercall_create_continuation((_op), 1, \ 75.14 + (unsigned long)(_a1)) 75.15 +#define hypercall2_create_continuation(_op, _a1, _a2) \ 75.16 + __hypercall_create_continuation((_op), 2, \ 75.17 + (unsigned long)(_a1), (unsigned long)(_a2)) 75.18 +#define hypercall3_create_continuation(_op, _a1, _a2, _a3) \ 75.19 + __hypercall_create_continuation((_op), 3, \ 75.20 + (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3)) 75.21 +#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4) \ 75.22 + __hypercall_create_continuation((_op), 4, \ 75.23 + (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ 75.24 + (unsigned long)(_a4)) 75.25 +#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5) \ 75.26 + __hypercall_create_continuation((_op), 5, \ 75.27 + (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ 75.28 + (unsigned long)(_a4), (unsigned long)(_a5)) 75.29 +#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \ 75.30 + __hypercall_create_continuation((_op), 6, \ 75.31 + (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ 75.32 + (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6)) 75.33 + 75.34 #define hypercall_preempt_check() \ 75.35 (unlikely(softirq_pending(smp_processor_id()))) 75.36