debuggers.hg
changeset 3758:9f7935ea4606
bitkeeper revision 1.1159.212.128 (4208d72fZEHIE9NOZZbr91V7R-3gUg)
Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk
--- a/.rootkeys	Tue Feb 08 12:27:23 2005 +0000
+++ b/.rootkeys	Tue Feb 08 15:13:51 2005 +0000
@@ -867,8 +867,8 @@ 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/
 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c
 3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c
 3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c
-40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c
 41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c
+40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c
 3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c
 41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c
 41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c
--- a/linux-2.4.29-xen-sparse/mm/memory.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.4.29-xen-sparse/mm/memory.c	Tue Feb 08 15:13:51 2005 +0000
@@ -915,7 +915,7 @@ static inline void establish_pte(struct
 #ifdef CONFIG_XEN
     if ( likely(vma->vm_mm == current->mm) ) {
         XEN_flush_page_update_queue();
-        HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG);
+        HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
     } else {
         set_pte(page_table, entry);
         flush_tlb_page(vma, address);
@@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct
 #ifdef CONFIG_XEN
     if ( likely(vma->vm_mm == current->mm) ) {
         XEN_flush_page_update_queue();
-        HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0);
+        HYPERVISOR_update_va_mapping(address, pte, 0);
     } else {
         set_pte(page_table, pte);
         XEN_flush_page_update_queue();
@@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_s
 #ifdef CONFIG_XEN
     if ( likely(vma->vm_mm == current->mm) ) {
         XEN_flush_page_update_queue();
-        HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0);
+        HYPERVISOR_update_va_mapping(addr, entry, 0);
     } else {
         set_pte(page_table, entry);
         XEN_flush_page_update_queue();
@@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct *
 #ifdef CONFIG_XEN
     if ( likely(vma->vm_mm == current->mm) ) {
         XEN_flush_page_update_queue();
-        HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0);
+        HYPERVISOR_update_va_mapping(address, entry, 0);
     } else {
         set_pte(page_table, entry);
         XEN_flush_page_update_queue();
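
Every hunk in this file makes the same change: HYPERVISOR_update_va_mapping() now takes the virtual address itself rather than its page number, so callers drop the ">> PAGE_SHIFT". A minimal before/after sketch, assuming the Linux 2.4 xen-sparse headers are in scope; the wrapper function name is illustrative and not part of this changeset:

/* Sketch only: update the PTE backing 'address' for the current mm. */
static inline void xen_update_current_pte_example(unsigned long address,
                                                  pte_t entry)
{
    XEN_flush_page_update_queue();
    /* Old interface: pass the virtual page number. */
    /* HYPERVISOR_update_va_mapping(address >> PAGE_SHIFT, entry, UVMF_INVLPG); */
    /* New interface: pass the virtual address unshifted. */
    HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
}
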
--- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Tue Feb 08 15:13:51 2005 +0000
@@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int
     for ( i = 0; i < nr_pages; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping;
-        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(idx, i);
         mcl[i].args[1] = 0;
         mcl[i].args[2] = 0;
     }
@@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blki
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
     if ( HYPERVISOR_update_va_mapping_otherdomain(
-        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        MMAP_VADDR(pending_idx, 0),
         (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
         0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
 
         goto out;
 #else
     if ( HYPERVISOR_update_va_mapping_otherdomain(
-        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        MMAP_VADDR(pending_idx, 0),
         (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
         0, blkif->domid) )
 
@@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t
     for ( i = 0; i < nr_psegs; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
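
The backend drivers issue this hypercall in batches via HYPERVISOR_multicall(). A hedged sketch of the multicall pattern after this change, with args[0] holding the virtual address directly; MMAP_VADDR, multicall_entry_t and UVMF_FLUSH_TLB are as used in blkback.c above, while the function name and the MMAP_PAGES_PER_REQUEST bound are illustrative assumptions:

/* Sketch only: batch-unmap nr_pages pages of a request's mapping window. */
static void unmap_request_window_example(int idx, int nr_pages)
{
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];  /* assumed upper bound */
    int i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i);  /* virtual address, no shift */
        mcl[i].args[1] = 0;                   /* zero PTE => unmap */
        mcl[i].args[2] = 0;                   /* no per-entry flush */
    }
    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; /* single flush at the end */

    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
}
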
--- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Tue Feb 08 15:13:51 2005 +0000
@@ -234,7 +234,7 @@ static void net_rx_action(unsigned long
         mmu[2].val = MMUEXT_REASSIGN_PAGE;
 
         mcl[0].op = __HYPERVISOR_update_va_mapping;
-        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[0] = vdata;
         mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
         mcl[0].args[2] = 0;
         mcl[1].op = __HYPERVISOR_mmu_update;
@@ -409,7 +409,7 @@ static void net_tx_action(unsigned long
     {
         pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
         mcl[0].op = __HYPERVISOR_update_va_mapping;
-        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx);
         mcl[0].args[1] = 0;
         mcl[0].args[2] = 0;
         mcl++;
@@ -546,7 +546,7 @@ static void net_tx_action(unsigned long
         skb_reserve(skb, 16);
 
         mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx);
         mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
         mcl[0].args[2] = 0;
         mcl[0].args[3] = netif->domid;
--- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Tue Feb 08 15:13:51 2005 +0000
@@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(str
             = INVALID_P2M_ENTRY;
 
         rx_mcl[i].op = __HYPERVISOR_update_va_mapping;
-        rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        rx_mcl[i].args[0] = (unsigned long)skb->head;
         rx_mcl[i].args[1] = 0;
         rx_mcl[i].args[2] = 0;
     }
@@ -593,7 +593,7 @@ static int netif_poll(struct net_device
         mmu->val = __pa(skb->head) >> PAGE_SHIFT;
         mmu++;
         mcl->op = __HYPERVISOR_update_va_mapping;
-        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        mcl->args[0] = (unsigned long)skb->head;
         mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
         mcl->args[2] = 0;
         mcl++;
--- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Tue Feb 08 15:13:51 2005 +0000
@@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int
     for ( i = 0; i < nr_pages; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping;
-        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(idx, i);
         mcl[i].args[1] = 0;
         mcl[i].args[2] = 0;
     }
@@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t
           i++, offset += PAGE_SIZE )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot;
         mcl[i].args[2] = 0;
         mcl[i].args[3] = up->domid;
@@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t
     {
         /* Map in ISO schedule, if necessary. */
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
         mcl[i].args[3] = up->domid;
--- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Feb 08 15:13:51 2005 +0000
@@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned lo
     if (__dirty) { \
         if ( likely((__vma)->vm_mm == current->mm) ) { \
             xen_flush_page_update_queue(); \
-            HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \
+            HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
         } else { \
             xen_l1_entry_update((__ptep), (__entry).pte_low); \
             flush_tlb_page((__vma), (__address)); \
@@ -445,7 +445,7 @@ do { \
 do { \
     if (likely((__vma)->vm_mm == current->mm)) { \
         xen_flush_page_update_queue(); \
-        HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, \
+        HYPERVISOR_update_va_mapping((__address), \
                                      __entry, 0); \
     } else { \
         xen_l1_entry_update((__ptep), (__entry).pte_low); \
--- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Tue Feb 08 15:13:51 2005 +0000
@@ -438,7 +438,7 @@ HYPERVISOR_multicall(
 
 static inline int
 HYPERVISOR_update_va_mapping(
-    unsigned long page_nr, pte_t new_val, unsigned long flags)
+    unsigned long va, pte_t new_val, unsigned long flags)
 {
     int ret;
     unsigned long ign1, ign2, ign3;
@@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping(
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
         : "0" (__HYPERVISOR_update_va_mapping),
-          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags)
+          "1" (va), "2" ((new_val).pte_low), "3" (flags)
         : "memory" );
 
     if ( unlikely(ret < 0) )
     {
         printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
-               page_nr, (new_val).pte_low, flags);
+               va, (new_val).pte_low, flags);
         BUG();
     }
 
@@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op(
 
 static inline int
 HYPERVISOR_update_va_mapping_otherdomain(
-    unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid)
+    unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
 {
     int ret;
     unsigned long ign1, ign2, ign3, ign4;
@@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
        : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
-          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
+          "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
         "memory" );
 
     return ret;
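
These are the guest-visible wrappers themselves: both now take a virtual address as their first argument, and the _otherdomain variant additionally names the domain that owns the frame being mapped. A hedged usage sketch; the helper name and error handling are illustrative, while pte_t, __PAGE_KERNEL, UVMF_INVLPG and the two hypercall wrappers are those defined above:

/* Sketch only: map machine frame 'mfn' belonging to domain 'domid' at */
/* kernel virtual address 'va', then tear the mapping down again.      */
static int map_foreign_frame_example(unsigned long va, unsigned long mfn,
                                     domid_t domid)
{
    pte_t pte = __pte((mfn << PAGE_SHIFT) | __PAGE_KERNEL);

    if ( HYPERVISOR_update_va_mapping_otherdomain(va, pte, 0, domid) != 0 )
        return -EINVAL;                      /* frame was not mappable */

    /* ... access the frame through 'va' ... */

    /* A zero PTE removes the mapping; UVMF_INVLPG flushes this VA. */
    HYPERVISOR_update_va_mapping(va, __pte(0), UVMF_INVLPG);
    return 0;
}
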
--- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Tue Feb 08 12:27:23 2005 +0000
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Tue Feb 08 15:13:51 2005 +0000
@@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, in
 }
 
 static inline int
-HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val,
+HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val,
     unsigned long flags)
 {
     int ret;
@@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned lo
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
         : "0" (__HYPERVISOR_update_va_mapping),
-          "1" (page_nr), "2" (new_val), "3" (flags)
+          "1" (va), "2" (new_val), "3" (flags)
         : "memory" );
 
     if (__predict_false(ret < 0))
         panic("Failed update VA mapping: %08lx, %08lx, %08lx",
-            page_nr, new_val, flags);
+            va, new_val, flags);
 
     return ret;
 }
@@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int c
 }
 
 static inline int
-HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr,
+HYPERVISOR_update_va_mapping_otherdomain(unsigned long va,
     unsigned long new_val, unsigned long flags, domid_t domid)
 {
     int ret;
@@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
         : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
-          "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) :
+          "1" (va), "2" (new_val), "3" (flags), "4" (domid) :
         "memory" );
 
     return ret;
--- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Tue Feb 08 15:13:51 2005 +0000
@@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_soft
         INVALID_P2M_ENTRY;
 
     rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
-    rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
+    rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va;
     rx_mcl[nr_pfns].args[1] = 0;
     rx_mcl[nr_pfns].args[2] = 0;
 
@@ -679,7 +679,7 @@ xen_network_handler(void *arg)
         mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
         mmu++;
         mcl->op = __HYPERVISOR_update_va_mapping;
-        mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
+        mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va;
         mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
         mcl->args[2] = UVMF_FLUSH_TLB; // 0;
         mcl++;
@@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_s
         INVALID_P2M_ENTRY;
 
     rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
-    rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
+    rx_mcl[nr_pfns].args[0] = va;
     rx_mcl[nr_pfns].args[1] = 0;
     rx_mcl[nr_pfns].args[2] = 0;
 
11.1 --- a/xen/arch/x86/memory.c Tue Feb 08 12:27:23 2005 +0000 11.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 11.3 @@ -1,2594 +0,0 @@ 11.4 -/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 11.5 -/****************************************************************************** 11.6 - * arch/x86/memory.c 11.7 - * 11.8 - * Copyright (c) 2002-2004 K A Fraser 11.9 - * Copyright (c) 2004 Christian Limpach 11.10 - * 11.11 - * This program is free software; you can redistribute it and/or modify 11.12 - * it under the terms of the GNU General Public License as published by 11.13 - * the Free Software Foundation; either version 2 of the License, or 11.14 - * (at your option) any later version. 11.15 - * 11.16 - * This program is distributed in the hope that it will be useful, 11.17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 11.18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11.19 - * GNU General Public License for more details. 11.20 - * 11.21 - * You should have received a copy of the GNU General Public License 11.22 - * along with this program; if not, write to the Free Software 11.23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 11.24 - */ 11.25 - 11.26 -/* 11.27 - * A description of the x86 page table API: 11.28 - * 11.29 - * Domains trap to do_mmu_update with a list of update requests. 11.30 - * This is a list of (ptr, val) pairs, where the requested operation 11.31 - * is *ptr = val. 11.32 - * 11.33 - * Reference counting of pages: 11.34 - * ---------------------------- 11.35 - * Each page has two refcounts: tot_count and type_count. 11.36 - * 11.37 - * TOT_COUNT is the obvious reference count. It counts all uses of a 11.38 - * physical page frame by a domain, including uses as a page directory, 11.39 - * a page table, or simple mappings via a PTE. This count prevents a 11.40 - * domain from releasing a frame back to the free pool when it still holds 11.41 - * a reference to it. 11.42 - * 11.43 - * TYPE_COUNT is more subtle. A frame can be put to one of three 11.44 - * mutually-exclusive uses: it might be used as a page directory, or a 11.45 - * page table, or it may be mapped writable by the domain [of course, a 11.46 - * frame may not be used in any of these three ways!]. 11.47 - * So, type_count is a count of the number of times a frame is being 11.48 - * referred to in its current incarnation. Therefore, a page can only 11.49 - * change its type when its type count is zero. 11.50 - * 11.51 - * Pinning the page type: 11.52 - * ---------------------- 11.53 - * The type of a page can be pinned/unpinned with the commands 11.54 - * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, 11.55 - * pinning is not reference counted, so it can't be nested). 11.56 - * This is useful to prevent a page's type count falling to zero, at which 11.57 - * point safety checks would need to be carried out next time the count 11.58 - * is increased again. 11.59 - * 11.60 - * A further note on writable page mappings: 11.61 - * ----------------------------------------- 11.62 - * For simplicity, the count of writable mappings for a page may not 11.63 - * correspond to reality. The 'writable count' is incremented for every 11.64 - * PTE which maps the page with the _PAGE_RW flag set. However, for 11.65 - * write access to be possible the page directory entry must also have 11.66 - * its _PAGE_RW bit set. 
We do not check this as it complicates the 11.67 - * reference counting considerably [consider the case of multiple 11.68 - * directory entries referencing a single page table, some with the RW 11.69 - * bit set, others not -- it starts getting a bit messy]. 11.70 - * In normal use, this simplification shouldn't be a problem. 11.71 - * However, the logic can be added if required. 11.72 - * 11.73 - * One more note on read-only page mappings: 11.74 - * ----------------------------------------- 11.75 - * We want domains to be able to map pages for read-only access. The 11.76 - * main reason is that page tables and directories should be readable 11.77 - * by a domain, but it would not be safe for them to be writable. 11.78 - * However, domains have free access to rings 1 & 2 of the Intel 11.79 - * privilege model. In terms of page protection, these are considered 11.80 - * to be part of 'supervisor mode'. The WP bit in CR0 controls whether 11.81 - * read-only restrictions are respected in supervisor mode -- if the 11.82 - * bit is clear then any mapped page is writable. 11.83 - * 11.84 - * We get round this by always setting the WP bit and disallowing 11.85 - * updates to it. This is very unlikely to cause a problem for guest 11.86 - * OS's, which will generally use the WP bit to simplify copy-on-write 11.87 - * implementation (in that case, OS wants a fault when it writes to 11.88 - * an application-supplied buffer). 11.89 - */ 11.90 - 11.91 -#include <xen/config.h> 11.92 -#include <xen/init.h> 11.93 -#include <xen/kernel.h> 11.94 -#include <xen/lib.h> 11.95 -#include <xen/mm.h> 11.96 -#include <xen/sched.h> 11.97 -#include <xen/errno.h> 11.98 -#include <xen/perfc.h> 11.99 -#include <xen/irq.h> 11.100 -#include <xen/softirq.h> 11.101 -#include <asm/shadow.h> 11.102 -#include <asm/page.h> 11.103 -#include <asm/flushtlb.h> 11.104 -#include <asm/io.h> 11.105 -#include <asm/uaccess.h> 11.106 -#include <asm/domain_page.h> 11.107 -#include <asm/ldt.h> 11.108 - 11.109 -#ifdef VERBOSE 11.110 -#define MEM_LOG(_f, _a...) \ 11.111 - printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ 11.112 - current->domain->id , __LINE__ , ## _a ) 11.113 -#else 11.114 -#define MEM_LOG(_f, _a...) ((void)0) 11.115 -#endif 11.116 - 11.117 -static int alloc_l2_table(struct pfn_info *page); 11.118 -static int alloc_l1_table(struct pfn_info *page); 11.119 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); 11.120 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 11.121 - u32 type, 11.122 - struct domain *d); 11.123 - 11.124 -static void free_l2_table(struct pfn_info *page); 11.125 -static void free_l1_table(struct pfn_info *page); 11.126 - 11.127 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); 11.128 -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); 11.129 - 11.130 -/* Used to defer flushing of memory structures. */ 11.131 -static struct { 11.132 -#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ 11.133 -#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ 11.134 - unsigned long deferred_ops; 11.135 - /* If non-NULL, specifies a foreign subject domain for some operations. */ 11.136 - struct domain *foreign; 11.137 -} __cacheline_aligned percpu_info[NR_CPUS]; 11.138 - 11.139 -/* 11.140 - * Returns the current foreign domain; defaults to the currently-executing 11.141 - * domain if a foreign override hasn't been specified. 11.142 - */ 11.143 -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? 
: current->domain) 11.144 - 11.145 -/* Private domain structs for DOMID_XEN and DOMID_IO. */ 11.146 -static struct domain *dom_xen, *dom_io; 11.147 - 11.148 -/* Frame table and its size in pages. */ 11.149 -struct pfn_info *frame_table; 11.150 -unsigned long frame_table_size; 11.151 -unsigned long max_page; 11.152 - 11.153 -void __init init_frametable(void) 11.154 -{ 11.155 - unsigned long i, p; 11.156 - 11.157 - frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; 11.158 - frame_table_size = max_page * sizeof(struct pfn_info); 11.159 - frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; 11.160 - 11.161 - for ( i = 0; i < frame_table_size; i += (4UL << 20) ) 11.162 - { 11.163 - p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); 11.164 - if ( p == 0 ) 11.165 - panic("Not enough memory for frame table\n"); 11.166 - map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 11.167 - 4UL << 20, PAGE_HYPERVISOR); 11.168 - } 11.169 - 11.170 - memset(frame_table, 0, frame_table_size); 11.171 -} 11.172 - 11.173 -void arch_init_memory(void) 11.174 -{ 11.175 - extern void subarch_init_memory(struct domain *); 11.176 - 11.177 - memset(percpu_info, 0, sizeof(percpu_info)); 11.178 - 11.179 - /* 11.180 - * Initialise our DOMID_XEN domain. 11.181 - * Any Xen-heap pages that we will allow to be mapped will have 11.182 - * their domain field set to dom_xen. 11.183 - */ 11.184 - dom_xen = alloc_domain_struct(); 11.185 - atomic_set(&dom_xen->refcnt, 1); 11.186 - dom_xen->id = DOMID_XEN; 11.187 - 11.188 - /* 11.189 - * Initialise our DOMID_IO domain. 11.190 - * This domain owns no pages but is considered a special case when 11.191 - * mapping I/O pages, as the mappings occur at the priv of the caller. 11.192 - */ 11.193 - dom_io = alloc_domain_struct(); 11.194 - atomic_set(&dom_io->refcnt, 1); 11.195 - dom_io->id = DOMID_IO; 11.196 - 11.197 - subarch_init_memory(dom_xen); 11.198 -} 11.199 - 11.200 -void write_ptbase(struct exec_domain *ed) 11.201 -{ 11.202 - struct domain *d = ed->domain; 11.203 - unsigned long pa; 11.204 - 11.205 -#ifdef CONFIG_VMX 11.206 - if ( unlikely(shadow_mode(d)) ) 11.207 - pa = ((shadow_mode(d) == SHM_full_32) ? 11.208 - pagetable_val(ed->arch.monitor_table) : 11.209 - pagetable_val(ed->arch.shadow_table)); 11.210 - else 11.211 - pa = pagetable_val(ed->arch.pagetable); 11.212 -#else 11.213 - if ( unlikely(shadow_mode(d)) ) 11.214 - pa = pagetable_val(ed->arch.shadow_table); 11.215 - else 11.216 - pa = pagetable_val(ed->arch.pagetable); 11.217 -#endif 11.218 - 11.219 - write_cr3(pa); 11.220 -} 11.221 - 11.222 -static void __invalidate_shadow_ldt(struct exec_domain *d) 11.223 -{ 11.224 - int i; 11.225 - unsigned long pfn; 11.226 - struct pfn_info *page; 11.227 - 11.228 - d->arch.shadow_ldt_mapcnt = 0; 11.229 - 11.230 - for ( i = 16; i < 32; i++ ) 11.231 - { 11.232 - pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]); 11.233 - if ( pfn == 0 ) continue; 11.234 - d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 11.235 - page = &frame_table[pfn]; 11.236 - ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); 11.237 - ASSERT_PAGE_IS_DOMAIN(page, d->domain); 11.238 - put_page_and_type(page); 11.239 - } 11.240 - 11.241 - /* Dispose of the (now possibly invalid) mappings from the TLB. 
*/ 11.242 - percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; 11.243 -} 11.244 - 11.245 - 11.246 -static inline void invalidate_shadow_ldt(struct exec_domain *d) 11.247 -{ 11.248 - if ( d->arch.shadow_ldt_mapcnt != 0 ) 11.249 - __invalidate_shadow_ldt(d); 11.250 -} 11.251 - 11.252 - 11.253 -static int alloc_segdesc_page(struct pfn_info *page) 11.254 -{ 11.255 - struct desc_struct *descs; 11.256 - int i; 11.257 - 11.258 - descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); 11.259 - 11.260 - for ( i = 0; i < 512; i++ ) 11.261 - if ( unlikely(!check_descriptor(&descs[i])) ) 11.262 - goto fail; 11.263 - 11.264 - unmap_domain_mem(descs); 11.265 - return 1; 11.266 - 11.267 - fail: 11.268 - unmap_domain_mem(descs); 11.269 - return 0; 11.270 -} 11.271 - 11.272 - 11.273 -/* Map shadow page at offset @off. */ 11.274 -int map_ldt_shadow_page(unsigned int off) 11.275 -{ 11.276 - struct exec_domain *ed = current; 11.277 - struct domain *d = ed->domain; 11.278 - unsigned long l1e; 11.279 - 11.280 - if ( unlikely(in_irq()) ) 11.281 - BUG(); 11.282 - 11.283 - __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> 11.284 - PAGE_SHIFT) + off]); 11.285 - 11.286 - if ( unlikely(!(l1e & _PAGE_PRESENT)) || 11.287 - unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 11.288 - d, PGT_ldt_page)) ) 11.289 - return 0; 11.290 - 11.291 - ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); 11.292 - ed->arch.shadow_ldt_mapcnt++; 11.293 - 11.294 - return 1; 11.295 -} 11.296 - 11.297 - 11.298 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) 11.299 -{ 11.300 - struct pfn_info *page = &frame_table[page_nr]; 11.301 - 11.302 - if ( unlikely(!pfn_is_ram(page_nr)) ) 11.303 - { 11.304 - MEM_LOG("Pfn %08lx is not RAM", page_nr); 11.305 - return 0; 11.306 - } 11.307 - 11.308 - if ( unlikely(!get_page(page, d)) ) 11.309 - { 11.310 - MEM_LOG("Could not get page ref for pfn %08lx", page_nr); 11.311 - return 0; 11.312 - } 11.313 - 11.314 - return 1; 11.315 -} 11.316 - 11.317 - 11.318 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 11.319 - u32 type, 11.320 - struct domain *d) 11.321 -{ 11.322 - struct pfn_info *page = &frame_table[page_nr]; 11.323 - 11.324 - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) 11.325 - return 0; 11.326 - 11.327 - if ( unlikely(!get_page_type(page, type)) ) 11.328 - { 11.329 -#ifdef VERBOSE 11.330 - if ( (type & PGT_type_mask) != PGT_l1_page_table ) 11.331 - MEM_LOG("Bad page type for pfn %08lx (%08x)", 11.332 - page_nr, page->u.inuse.type_info); 11.333 -#endif 11.334 - put_page(page); 11.335 - return 0; 11.336 - } 11.337 - 11.338 - return 1; 11.339 -} 11.340 - 11.341 - 11.342 -/* 11.343 - * We allow an L2 tables to map each other (a.k.a. linear page tables). It 11.344 - * needs some special care with reference counst and access permissions: 11.345 - * 1. The mapping entry must be read-only, or the guest may get write access 11.346 - * to its own PTEs. 11.347 - * 2. We must only bump the reference counts for an *already validated* 11.348 - * L2 table, or we can end up in a deadlock in get_page_type() by waiting 11.349 - * on a validation that is required to complete that validation. 11.350 - * 3. We only need to increment the reference counts for the mapped page 11.351 - * frame if it is mapped by a different L2 table. This is sufficient and 11.352 - * also necessary to allow validation of an L2 table mapping itself. 
11.353 - */ 11.354 -static int 11.355 -get_linear_pagetable( 11.356 - l2_pgentry_t l2e, unsigned long pfn, struct domain *d) 11.357 -{ 11.358 - u32 x, y; 11.359 - struct pfn_info *page; 11.360 - 11.361 - if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) 11.362 - { 11.363 - MEM_LOG("Attempt to create linear p.t. with write perms"); 11.364 - return 0; 11.365 - } 11.366 - 11.367 - if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 11.368 - { 11.369 - /* Make sure the mapped frame belongs to the correct domain. */ 11.370 - if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) ) 11.371 - return 0; 11.372 - 11.373 - /* 11.374 - * Make sure that the mapped frame is an already-validated L2 table. 11.375 - * If so, atomically increment the count (checking for overflow). 11.376 - */ 11.377 - page = &frame_table[l2_pgentry_to_pagenr(l2e)]; 11.378 - y = page->u.inuse.type_info; 11.379 - do { 11.380 - x = y; 11.381 - if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || 11.382 - unlikely((x & (PGT_type_mask|PGT_validated)) != 11.383 - (PGT_l2_page_table|PGT_validated)) ) 11.384 - { 11.385 - put_page(page); 11.386 - return 0; 11.387 - } 11.388 - } 11.389 - while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); 11.390 - } 11.391 - 11.392 - return 1; 11.393 -} 11.394 - 11.395 - 11.396 -static int 11.397 -get_page_from_l1e( 11.398 - l1_pgentry_t l1e, struct domain *d) 11.399 -{ 11.400 - unsigned long l1v = l1_pgentry_val(l1e); 11.401 - unsigned long pfn = l1_pgentry_to_pagenr(l1e); 11.402 - struct pfn_info *page = &frame_table[pfn]; 11.403 - extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); 11.404 - 11.405 - if ( !(l1v & _PAGE_PRESENT) ) 11.406 - return 1; 11.407 - 11.408 - if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) 11.409 - { 11.410 - MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); 11.411 - return 0; 11.412 - } 11.413 - 11.414 - if ( unlikely(!pfn_is_ram(pfn)) ) 11.415 - { 11.416 - /* Revert to caller privileges if FD == DOMID_IO. */ 11.417 - if ( d == dom_io ) 11.418 - d = current->domain; 11.419 - 11.420 - if ( IS_PRIV(d) ) 11.421 - return 1; 11.422 - 11.423 - if ( IS_CAPABLE_PHYSDEV(d) ) 11.424 - return domain_iomem_in_pfn(d, pfn); 11.425 - 11.426 - MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); 11.427 - return 0; 11.428 - } 11.429 - 11.430 - return ((l1v & _PAGE_RW) ? 11.431 - get_page_and_type(page, d, PGT_writable_page) : 11.432 - get_page(page, d)); 11.433 -} 11.434 - 11.435 - 11.436 -/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ 11.437 -static int 11.438 -get_page_from_l2e( 11.439 - l2_pgentry_t l2e, unsigned long pfn, 11.440 - struct domain *d, unsigned long va_idx) 11.441 -{ 11.442 - int rc; 11.443 - 11.444 - if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 11.445 - return 1; 11.446 - 11.447 - if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 11.448 - { 11.449 - MEM_LOG("Bad L2 page type settings %04lx", 11.450 - l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); 11.451 - return 0; 11.452 - } 11.453 - 11.454 - rc = get_page_and_type_from_pagenr( 11.455 - l2_pgentry_to_pagenr(l2e), 11.456 - PGT_l1_page_table | (va_idx<<PGT_va_shift), d); 11.457 - 11.458 - if ( unlikely(!rc) ) 11.459 - return get_linear_pagetable(l2e, pfn, d); 11.460 - 11.461 - return 1; 11.462 -} 11.463 - 11.464 - 11.465 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) 11.466 -{ 11.467 - unsigned long l1v = l1_pgentry_val(l1e); 11.468 - unsigned long pfn = l1_pgentry_to_pagenr(l1e); 11.469 - struct pfn_info *page = &frame_table[pfn]; 11.470 - struct domain *e; 11.471 - 11.472 - if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) ) 11.473 - return; 11.474 - 11.475 - e = page_get_owner(page); 11.476 - if ( unlikely(e != d) ) 11.477 - { 11.478 - /* 11.479 - * Unmap a foreign page that may have been mapped via a grant table. 11.480 - * Note that this can fail for a privileged domain that can map foreign 11.481 - * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings 11.482 - * counted via a grant entry and some counted directly in the page 11.483 - * structure's reference count. Note that reference counts won't get 11.484 - * dangerously confused as long as we always try to decrement the 11.485 - * grant entry first. We may end up with a mismatch between which 11.486 - * mappings and which unmappings are counted via the grant entry, but 11.487 - * really it doesn't matter as privileged domains have carte blanche. 11.488 - */ 11.489 - if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) ) 11.490 - return; 11.491 - /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */ 11.492 - } 11.493 - 11.494 - if ( l1v & _PAGE_RW ) 11.495 - { 11.496 - put_page_and_type(page); 11.497 - } 11.498 - else 11.499 - { 11.500 - /* We expect this is rare so we blow the entire shadow LDT. */ 11.501 - if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 11.502 - PGT_ldt_page)) && 11.503 - unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) 11.504 - invalidate_shadow_ldt(e->exec_domain[0]); 11.505 - put_page(page); 11.506 - } 11.507 -} 11.508 - 11.509 - 11.510 -/* 11.511 - * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 11.512 - * Note also that this automatically deals correctly with linear p.t.'s. 
11.513 - */ 11.514 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 11.515 -{ 11.516 - if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 11.517 - ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) 11.518 - put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); 11.519 -} 11.520 - 11.521 - 11.522 -static int alloc_l2_table(struct pfn_info *page) 11.523 -{ 11.524 - struct domain *d = page_get_owner(page); 11.525 - unsigned long page_nr = page_to_pfn(page); 11.526 - l2_pgentry_t *pl2e; 11.527 - int i; 11.528 - 11.529 - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 11.530 - 11.531 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 11.532 - if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) 11.533 - goto fail; 11.534 - 11.535 -#if defined(__i386__) 11.536 - /* Now we add our private high mappings. */ 11.537 - memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 11.538 - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 11.539 - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 11.540 - pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = 11.541 - mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 11.542 - pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 11.543 - mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 11.544 - __PAGE_HYPERVISOR); 11.545 -#endif 11.546 - 11.547 - unmap_domain_mem(pl2e); 11.548 - return 1; 11.549 - 11.550 - fail: 11.551 - while ( i-- > 0 ) 11.552 - put_page_from_l2e(pl2e[i], page_nr); 11.553 - 11.554 - unmap_domain_mem(pl2e); 11.555 - return 0; 11.556 -} 11.557 - 11.558 - 11.559 -static int alloc_l1_table(struct pfn_info *page) 11.560 -{ 11.561 - struct domain *d = page_get_owner(page); 11.562 - unsigned long page_nr = page_to_pfn(page); 11.563 - l1_pgentry_t *pl1e; 11.564 - int i; 11.565 - 11.566 - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 11.567 - 11.568 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.569 - if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) 11.570 - goto fail; 11.571 - 11.572 - unmap_domain_mem(pl1e); 11.573 - return 1; 11.574 - 11.575 - fail: 11.576 - while ( i-- > 0 ) 11.577 - put_page_from_l1e(pl1e[i], d); 11.578 - 11.579 - unmap_domain_mem(pl1e); 11.580 - return 0; 11.581 -} 11.582 - 11.583 - 11.584 -static void free_l2_table(struct pfn_info *page) 11.585 -{ 11.586 - unsigned long page_nr = page - frame_table; 11.587 - l2_pgentry_t *pl2e; 11.588 - int i; 11.589 - 11.590 - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 11.591 - 11.592 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 11.593 - put_page_from_l2e(pl2e[i], page_nr); 11.594 - 11.595 - unmap_domain_mem(pl2e); 11.596 -} 11.597 - 11.598 - 11.599 -static void free_l1_table(struct pfn_info *page) 11.600 -{ 11.601 - struct domain *d = page_get_owner(page); 11.602 - unsigned long page_nr = page - frame_table; 11.603 - l1_pgentry_t *pl1e; 11.604 - int i; 11.605 - 11.606 - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 11.607 - 11.608 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.609 - put_page_from_l1e(pl1e[i], d); 11.610 - 11.611 - unmap_domain_mem(pl1e); 11.612 -} 11.613 - 11.614 - 11.615 -static inline int update_l2e(l2_pgentry_t *pl2e, 11.616 - l2_pgentry_t ol2e, 11.617 - l2_pgentry_t nl2e) 11.618 -{ 11.619 - unsigned long o = cmpxchg((unsigned long *)pl2e, 11.620 - l2_pgentry_val(ol2e), 11.621 - l2_pgentry_val(nl2e)); 11.622 - if ( o != l2_pgentry_val(ol2e) ) 11.623 - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 11.624 - l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); 11.625 - return (o == 
l2_pgentry_val(ol2e)); 11.626 -} 11.627 - 11.628 - 11.629 -/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ 11.630 -static int mod_l2_entry(l2_pgentry_t *pl2e, 11.631 - l2_pgentry_t nl2e, 11.632 - unsigned long pfn) 11.633 -{ 11.634 - l2_pgentry_t ol2e; 11.635 - unsigned long _ol2e; 11.636 - 11.637 - if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= 11.638 - DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 11.639 - { 11.640 - MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); 11.641 - return 0; 11.642 - } 11.643 - 11.644 - if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) 11.645 - return 0; 11.646 - ol2e = mk_l2_pgentry(_ol2e); 11.647 - 11.648 - if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) 11.649 - { 11.650 - /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 11.651 - if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) 11.652 - return update_l2e(pl2e, ol2e, nl2e); 11.653 - 11.654 - if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, 11.655 - ((unsigned long)pl2e & 11.656 - ~PAGE_MASK) >> 2)) ) 11.657 - return 0; 11.658 - 11.659 - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 11.660 - { 11.661 - put_page_from_l2e(nl2e, pfn); 11.662 - return 0; 11.663 - } 11.664 - 11.665 - put_page_from_l2e(ol2e, pfn); 11.666 - return 1; 11.667 - } 11.668 - 11.669 - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 11.670 - return 0; 11.671 - 11.672 - put_page_from_l2e(ol2e, pfn); 11.673 - return 1; 11.674 -} 11.675 - 11.676 - 11.677 -static inline int update_l1e(l1_pgentry_t *pl1e, 11.678 - l1_pgentry_t ol1e, 11.679 - l1_pgentry_t nl1e) 11.680 -{ 11.681 - unsigned long o = l1_pgentry_val(ol1e); 11.682 - unsigned long n = l1_pgentry_val(nl1e); 11.683 - 11.684 - if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || 11.685 - unlikely(o != l1_pgentry_val(ol1e)) ) 11.686 - { 11.687 - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 11.688 - l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); 11.689 - return 0; 11.690 - } 11.691 - 11.692 - return 1; 11.693 -} 11.694 - 11.695 - 11.696 -/* Update the L1 entry at pl1e to new value nl1e. */ 11.697 -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) 11.698 -{ 11.699 - l1_pgentry_t ol1e; 11.700 - unsigned long _ol1e; 11.701 - struct domain *d = current->domain; 11.702 - 11.703 - if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) 11.704 - { 11.705 - MEM_LOG("Bad get_user\n"); 11.706 - return 0; 11.707 - } 11.708 - 11.709 - ol1e = mk_l1_pgentry(_ol1e); 11.710 - 11.711 - if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) 11.712 - { 11.713 - /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? 
*/ 11.714 - if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) 11.715 - return update_l1e(pl1e, ol1e, nl1e); 11.716 - 11.717 - if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) 11.718 - return 0; 11.719 - 11.720 - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 11.721 - { 11.722 - put_page_from_l1e(nl1e, d); 11.723 - return 0; 11.724 - } 11.725 - 11.726 - put_page_from_l1e(ol1e, d); 11.727 - return 1; 11.728 - } 11.729 - 11.730 - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 11.731 - return 0; 11.732 - 11.733 - put_page_from_l1e(ol1e, d); 11.734 - return 1; 11.735 -} 11.736 - 11.737 - 11.738 -int alloc_page_type(struct pfn_info *page, unsigned int type) 11.739 -{ 11.740 - switch ( type ) 11.741 - { 11.742 - case PGT_l1_page_table: 11.743 - return alloc_l1_table(page); 11.744 - case PGT_l2_page_table: 11.745 - return alloc_l2_table(page); 11.746 - case PGT_gdt_page: 11.747 - case PGT_ldt_page: 11.748 - return alloc_segdesc_page(page); 11.749 - default: 11.750 - printk("Bad type in alloc_page_type %x t=%x c=%x\n", 11.751 - type, page->u.inuse.type_info, 11.752 - page->count_info); 11.753 - BUG(); 11.754 - } 11.755 - 11.756 - return 0; 11.757 -} 11.758 - 11.759 - 11.760 -void free_page_type(struct pfn_info *page, unsigned int type) 11.761 -{ 11.762 - struct domain *d = page_get_owner(page); 11.763 - 11.764 - switch ( type ) 11.765 - { 11.766 - case PGT_l1_page_table: 11.767 - free_l1_table(page); 11.768 - break; 11.769 - 11.770 - case PGT_l2_page_table: 11.771 - free_l2_table(page); 11.772 - break; 11.773 - 11.774 - default: 11.775 - BUG(); 11.776 - } 11.777 - 11.778 - if ( unlikely(shadow_mode(d)) && 11.779 - (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) 11.780 - { 11.781 - unshadow_table(page_to_pfn(page), type); 11.782 - put_shadow_status(d); 11.783 - } 11.784 -} 11.785 - 11.786 - 11.787 -void put_page_type(struct pfn_info *page) 11.788 -{ 11.789 - u32 nx, x, y = page->u.inuse.type_info; 11.790 - 11.791 - again: 11.792 - do { 11.793 - x = y; 11.794 - nx = x - 1; 11.795 - 11.796 - ASSERT((x & PGT_count_mask) != 0); 11.797 - 11.798 - /* 11.799 - * The page should always be validated while a reference is held. The 11.800 - * exception is during domain destruction, when we forcibly invalidate 11.801 - * page-table pages if we detect a referential loop. 11.802 - * See domain.c:relinquish_list(). 11.803 - */ 11.804 - ASSERT((x & PGT_validated) || 11.805 - test_bit(DF_DYING, &page_get_owner(page)->d_flags)); 11.806 - 11.807 - if ( unlikely((nx & PGT_count_mask) == 0) ) 11.808 - { 11.809 - /* Record TLB information for flush later. Races are harmless. */ 11.810 - page->tlbflush_timestamp = tlbflush_current_time(); 11.811 - 11.812 - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 11.813 - likely(nx & PGT_validated) ) 11.814 - { 11.815 - /* 11.816 - * Page-table pages must be unvalidated when count is zero. The 11.817 - * 'free' is safe because the refcnt is non-zero and validated 11.818 - * bit is clear => other ops will spin or fail. 11.819 - */ 11.820 - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 11.821 - x & ~PGT_validated)) != x) ) 11.822 - goto again; 11.823 - /* We cleared the 'valid bit' so we do the clear up. */ 11.824 - free_page_type(page, x & PGT_type_mask); 11.825 - /* Carry on, but with the 'valid bit' now clear. 
*/ 11.826 - x &= ~PGT_validated; 11.827 - nx &= ~PGT_validated; 11.828 - } 11.829 - } 11.830 - else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 11.831 - (PGT_pinned | 1)) ) 11.832 - { 11.833 - /* Page is now only pinned. Make the back pointer mutable again. */ 11.834 - nx |= PGT_va_mutable; 11.835 - } 11.836 - } 11.837 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 11.838 -} 11.839 - 11.840 - 11.841 -int get_page_type(struct pfn_info *page, u32 type) 11.842 -{ 11.843 - u32 nx, x, y = page->u.inuse.type_info; 11.844 - 11.845 - again: 11.846 - do { 11.847 - x = y; 11.848 - nx = x + 1; 11.849 - if ( unlikely((nx & PGT_count_mask) == 0) ) 11.850 - { 11.851 - MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); 11.852 - return 0; 11.853 - } 11.854 - else if ( unlikely((x & PGT_count_mask) == 0) ) 11.855 - { 11.856 - if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) 11.857 - { 11.858 - /* 11.859 - * On type change we check to flush stale TLB entries. This 11.860 - * may be unnecessary (e.g., page was GDT/LDT) but those 11.861 - * circumstances should be very rare. 11.862 - */ 11.863 - struct domain *d = page_get_owner(page); 11.864 - if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], 11.865 - page->tlbflush_timestamp)) ) 11.866 - { 11.867 - perfc_incr(need_flush_tlb_flush); 11.868 - flush_tlb_cpu(d->exec_domain[0]->processor); 11.869 - } 11.870 - 11.871 - /* We lose existing type, back pointer, and validity. */ 11.872 - nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); 11.873 - nx |= type; 11.874 - 11.875 - /* No special validation needed for writable pages. */ 11.876 - /* Page tables and GDT/LDT need to be scanned for validity. */ 11.877 - if ( type == PGT_writable_page ) 11.878 - nx |= PGT_validated; 11.879 - } 11.880 - } 11.881 - else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) 11.882 - { 11.883 - if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) 11.884 - { 11.885 - if ( ((x & PGT_type_mask) != PGT_l2_page_table) || 11.886 - ((type & PGT_type_mask) != PGT_l1_page_table) ) 11.887 - MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", 11.888 - x & PGT_type_mask, type, page_to_pfn(page)); 11.889 - return 0; 11.890 - } 11.891 - else if ( (x & PGT_va_mask) == PGT_va_mutable ) 11.892 - { 11.893 - /* The va backpointer is mutable, hence we update it. */ 11.894 - nx &= ~PGT_va_mask; 11.895 - nx |= type; /* we know the actual type is correct */ 11.896 - } 11.897 - else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) 11.898 - { 11.899 - /* This table is potentially mapped at multiple locations. */ 11.900 - nx &= ~PGT_va_mask; 11.901 - nx |= PGT_va_unknown; 11.902 - } 11.903 - } 11.904 - else if ( unlikely(!(x & PGT_validated)) ) 11.905 - { 11.906 - /* Someone else is updating validation of this page. Wait... */ 11.907 - while ( (y = page->u.inuse.type_info) == x ) 11.908 - { 11.909 - rep_nop(); 11.910 - barrier(); 11.911 - } 11.912 - goto again; 11.913 - } 11.914 - } 11.915 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 11.916 - 11.917 - if ( unlikely(!(nx & PGT_validated)) ) 11.918 - { 11.919 - /* Try to validate page type; drop the new reference on failure. */ 11.920 - if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) 11.921 - { 11.922 - MEM_LOG("Error while validating pfn %08lx for type %08x." 
11.923 - " caf=%08x taf=%08x\n", 11.924 - page_to_pfn(page), type, 11.925 - page->count_info, 11.926 - page->u.inuse.type_info); 11.927 - /* Noone else can get a reference. We hold the only ref. */ 11.928 - page->u.inuse.type_info = 0; 11.929 - return 0; 11.930 - } 11.931 - 11.932 - /* Noone else is updating simultaneously. */ 11.933 - __set_bit(_PGT_validated, &page->u.inuse.type_info); 11.934 - } 11.935 - 11.936 - return 1; 11.937 -} 11.938 - 11.939 - 11.940 -int new_guest_cr3(unsigned long pfn) 11.941 -{ 11.942 - struct exec_domain *ed = current; 11.943 - struct domain *d = ed->domain; 11.944 - int okay, cpu = smp_processor_id(); 11.945 - unsigned long old_base_pfn; 11.946 - 11.947 - okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); 11.948 - if ( likely(okay) ) 11.949 - { 11.950 - invalidate_shadow_ldt(ed); 11.951 - 11.952 - percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; 11.953 - old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 11.954 - ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 11.955 - 11.956 - shadow_mk_pagetable(ed); 11.957 - 11.958 - write_ptbase(ed); 11.959 - 11.960 - put_page_and_type(&frame_table[old_base_pfn]); 11.961 - } 11.962 - else 11.963 - { 11.964 - MEM_LOG("Error while installing new baseptr %08lx", pfn); 11.965 - } 11.966 - 11.967 - return okay; 11.968 -} 11.969 - 11.970 -static int do_extended_command(unsigned long ptr, unsigned long val) 11.971 -{ 11.972 - int okay = 1, cpu = smp_processor_id(); 11.973 - unsigned int cmd = val & MMUEXT_CMD_MASK; 11.974 - unsigned long pfn = ptr >> PAGE_SHIFT; 11.975 - struct pfn_info *page = &frame_table[pfn]; 11.976 - struct exec_domain *ed = current; 11.977 - struct domain *d = ed->domain, *nd, *e; 11.978 - u32 x, y; 11.979 - domid_t domid; 11.980 - grant_ref_t gntref; 11.981 - 11.982 - switch ( cmd ) 11.983 - { 11.984 - case MMUEXT_PIN_L1_TABLE: 11.985 - case MMUEXT_PIN_L2_TABLE: 11.986 - /* 11.987 - * We insist that, if you pin an L1 page, it's the first thing that 11.988 - * you do to it. This is because we require the backptr to still be 11.989 - * mutable. This assumption seems safe. 11.990 - */ 11.991 - okay = get_page_and_type_from_pagenr( 11.992 - pfn, 11.993 - ((cmd==MMUEXT_PIN_L2_TABLE) ? 
11.994 - PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), 11.995 - FOREIGNDOM); 11.996 - 11.997 - if ( unlikely(!okay) ) 11.998 - { 11.999 - MEM_LOG("Error while pinning pfn %08lx", pfn); 11.1000 - break; 11.1001 - } 11.1002 - 11.1003 - if ( unlikely(test_and_set_bit(_PGT_pinned, 11.1004 - &page->u.inuse.type_info)) ) 11.1005 - { 11.1006 - MEM_LOG("Pfn %08lx already pinned", pfn); 11.1007 - put_page_and_type(page); 11.1008 - okay = 0; 11.1009 - break; 11.1010 - } 11.1011 - 11.1012 - break; 11.1013 - 11.1014 - case MMUEXT_UNPIN_TABLE: 11.1015 - if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) 11.1016 - { 11.1017 - MEM_LOG("Page %08lx bad domain (dom=%p)", 11.1018 - ptr, page_get_owner(page)); 11.1019 - } 11.1020 - else if ( likely(test_and_clear_bit(_PGT_pinned, 11.1021 - &page->u.inuse.type_info)) ) 11.1022 - { 11.1023 - put_page_and_type(page); 11.1024 - put_page(page); 11.1025 - } 11.1026 - else 11.1027 - { 11.1028 - okay = 0; 11.1029 - put_page(page); 11.1030 - MEM_LOG("Pfn %08lx not pinned", pfn); 11.1031 - } 11.1032 - break; 11.1033 - 11.1034 - case MMUEXT_NEW_BASEPTR: 11.1035 - okay = new_guest_cr3(pfn); 11.1036 - break; 11.1037 - 11.1038 - case MMUEXT_TLB_FLUSH: 11.1039 - percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; 11.1040 - break; 11.1041 - 11.1042 - case MMUEXT_INVLPG: 11.1043 - __flush_tlb_one(ptr); 11.1044 - break; 11.1045 - 11.1046 - case MMUEXT_FLUSH_CACHE: 11.1047 - if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) 11.1048 - { 11.1049 - MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); 11.1050 - okay = 0; 11.1051 - } 11.1052 - else 11.1053 - { 11.1054 - wbinvd(); 11.1055 - } 11.1056 - break; 11.1057 - 11.1058 - case MMUEXT_SET_LDT: 11.1059 - { 11.1060 - unsigned long ents = val >> MMUEXT_CMD_SHIFT; 11.1061 - if ( ((ptr & (PAGE_SIZE-1)) != 0) || 11.1062 - (ents > 8192) || 11.1063 - ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || 11.1064 - ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) 11.1065 - { 11.1066 - okay = 0; 11.1067 - MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); 11.1068 - } 11.1069 - else if ( (ed->arch.ldt_ents != ents) || 11.1070 - (ed->arch.ldt_base != ptr) ) 11.1071 - { 11.1072 - invalidate_shadow_ldt(ed); 11.1073 - ed->arch.ldt_base = ptr; 11.1074 - ed->arch.ldt_ents = ents; 11.1075 - load_LDT(ed); 11.1076 - percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; 11.1077 - if ( ents != 0 ) 11.1078 - percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; 11.1079 - } 11.1080 - break; 11.1081 - } 11.1082 - 11.1083 - case MMUEXT_SET_FOREIGNDOM: 11.1084 - domid = (domid_t)(val >> 16); 11.1085 - 11.1086 - if ( (e = percpu_info[cpu].foreign) != NULL ) 11.1087 - put_domain(e); 11.1088 - percpu_info[cpu].foreign = NULL; 11.1089 - 11.1090 - if ( !IS_PRIV(d) ) 11.1091 - { 11.1092 - switch ( domid ) 11.1093 - { 11.1094 - case DOMID_IO: 11.1095 - get_knownalive_domain(dom_io); 11.1096 - percpu_info[cpu].foreign = dom_io; 11.1097 - break; 11.1098 - default: 11.1099 - MEM_LOG("Dom %u cannot set foreign dom\n", d->id); 11.1100 - okay = 0; 11.1101 - break; 11.1102 - } 11.1103 - } 11.1104 - else 11.1105 - { 11.1106 - percpu_info[cpu].foreign = e = find_domain_by_id(domid); 11.1107 - if ( e == NULL ) 11.1108 - { 11.1109 - switch ( domid ) 11.1110 - { 11.1111 - case DOMID_XEN: 11.1112 - get_knownalive_domain(dom_xen); 11.1113 - percpu_info[cpu].foreign = dom_xen; 11.1114 - break; 11.1115 - case DOMID_IO: 11.1116 - get_knownalive_domain(dom_io); 11.1117 - percpu_info[cpu].foreign = dom_io; 11.1118 - break; 11.1119 - default: 11.1120 - MEM_LOG("Unknown domain '%u'", 
domid); 11.1121 - okay = 0; 11.1122 - break; 11.1123 - } 11.1124 - } 11.1125 - } 11.1126 - break; 11.1127 - 11.1128 - case MMUEXT_TRANSFER_PAGE: 11.1129 - domid = (domid_t)(val >> 16); 11.1130 - gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); 11.1131 - 11.1132 - if ( unlikely(IS_XEN_HEAP_FRAME(page)) || 11.1133 - unlikely(!pfn_is_ram(pfn)) || 11.1134 - unlikely((e = find_domain_by_id(domid)) == NULL) ) 11.1135 - { 11.1136 - MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); 11.1137 - okay = 0; 11.1138 - break; 11.1139 - } 11.1140 - 11.1141 - spin_lock(&d->page_alloc_lock); 11.1142 - 11.1143 - /* 11.1144 - * The tricky bit: atomically release ownership while there is just one 11.1145 - * benign reference to the page (PGC_allocated). If that reference 11.1146 - * disappears then the deallocation routine will safely spin. 11.1147 - */ 11.1148 - nd = page_get_owner(page); 11.1149 - y = page->count_info; 11.1150 - do { 11.1151 - x = y; 11.1152 - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 11.1153 - (1|PGC_allocated)) || 11.1154 - unlikely(nd != d) ) 11.1155 - { 11.1156 - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 11.1157 - " caf=%08x, taf=%08x\n", page_to_pfn(page), 11.1158 - d, d->id, nd, x, page->u.inuse.type_info); 11.1159 - spin_unlock(&d->page_alloc_lock); 11.1160 - put_domain(e); 11.1161 - return 0; 11.1162 - } 11.1163 - __asm__ __volatile__( 11.1164 - LOCK_PREFIX "cmpxchg8b %2" 11.1165 - : "=d" (nd), "=a" (y), 11.1166 - "=m" (*(volatile u64 *)(&page->count_info)) 11.1167 - : "0" (d), "1" (x), "c" (NULL), "b" (x) ); 11.1168 - } 11.1169 - while ( unlikely(nd != d) || unlikely(y != x) ); 11.1170 - 11.1171 - /* 11.1172 - * Unlink from 'd'. At least one reference remains (now anonymous), so 11.1173 - * noone else is spinning to try to delete this page from 'd'. 11.1174 - */ 11.1175 - d->tot_pages--; 11.1176 - list_del(&page->list); 11.1177 - 11.1178 - spin_unlock(&d->page_alloc_lock); 11.1179 - 11.1180 - spin_lock(&e->page_alloc_lock); 11.1181 - 11.1182 - /* 11.1183 - * Check that 'e' will accept the page and has reservation headroom. 11.1184 - * Also, a domain mustn't have PGC_allocated pages when it is dying. 11.1185 - */ 11.1186 - ASSERT(e->tot_pages <= e->max_pages); 11.1187 - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 11.1188 - unlikely(e->tot_pages == e->max_pages) || 11.1189 - unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) 11.1190 - { 11.1191 - MEM_LOG("Transferee has no reservation headroom (%d,%d), or " 11.1192 - "provided a bad grant ref, or is dying (%08lx).\n", 11.1193 - e->tot_pages, e->max_pages, e->d_flags); 11.1194 - spin_unlock(&e->page_alloc_lock); 11.1195 - put_domain(e); 11.1196 - okay = 0; 11.1197 - break; 11.1198 - } 11.1199 - 11.1200 - /* Okay, add the page to 'e'. */ 11.1201 - if ( unlikely(e->tot_pages++ == 0) ) 11.1202 - get_knownalive_domain(e); 11.1203 - list_add_tail(&page->list, &e->page_list); 11.1204 - page_set_owner(page, e); 11.1205 - 11.1206 - spin_unlock(&e->page_alloc_lock); 11.1207 - 11.1208 - /* Transfer is all done: tell the guest about its new page frame. 
*/ 11.1209 - gnttab_notify_transfer(e, gntref, pfn); 11.1210 - 11.1211 - put_domain(e); 11.1212 - break; 11.1213 - 11.1214 - case MMUEXT_REASSIGN_PAGE: 11.1215 - if ( unlikely(!IS_PRIV(d)) ) 11.1216 - { 11.1217 - MEM_LOG("Dom %u has no reassignment priv", d->id); 11.1218 - okay = 0; 11.1219 - break; 11.1220 - } 11.1221 - 11.1222 - e = percpu_info[cpu].foreign; 11.1223 - if ( unlikely(e == NULL) ) 11.1224 - { 11.1225 - MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); 11.1226 - okay = 0; 11.1227 - break; 11.1228 - } 11.1229 - 11.1230 - /* 11.1231 - * Grab both page_list locks, in order. This prevents the page from 11.1232 - * disappearing elsewhere while we modify the owner, and we'll need 11.1233 - * both locks if we're successful so that we can change lists. 11.1234 - */ 11.1235 - if ( d < e ) 11.1236 - { 11.1237 - spin_lock(&d->page_alloc_lock); 11.1238 - spin_lock(&e->page_alloc_lock); 11.1239 - } 11.1240 - else 11.1241 - { 11.1242 - spin_lock(&e->page_alloc_lock); 11.1243 - spin_lock(&d->page_alloc_lock); 11.1244 - } 11.1245 - 11.1246 - /* A domain shouldn't have PGC_allocated pages when it is dying. */ 11.1247 - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 11.1248 - unlikely(IS_XEN_HEAP_FRAME(page)) ) 11.1249 - { 11.1250 - MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); 11.1251 - okay = 0; 11.1252 - goto reassign_fail; 11.1253 - } 11.1254 - 11.1255 - /* 11.1256 - * The tricky bit: atomically change owner while there is just one 11.1257 - * benign reference to the page (PGC_allocated). If that reference 11.1258 - * disappears then the deallocation routine will safely spin. 11.1259 - */ 11.1260 - nd = page_get_owner(page); 11.1261 - y = page->count_info; 11.1262 - do { 11.1263 - x = y; 11.1264 - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 11.1265 - (1|PGC_allocated)) || 11.1266 - unlikely(nd != d) ) 11.1267 - { 11.1268 - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 11.1269 - " caf=%08x, taf=%08x\n", page_to_pfn(page), 11.1270 - d, d->id, nd, x, page->u.inuse.type_info); 11.1271 - okay = 0; 11.1272 - goto reassign_fail; 11.1273 - } 11.1274 - __asm__ __volatile__( 11.1275 - LOCK_PREFIX "cmpxchg8b %3" 11.1276 - : "=d" (nd), "=a" (y), "=c" (e), 11.1277 - "=m" (*(volatile u64 *)(&page->count_info)) 11.1278 - : "0" (d), "1" (x), "c" (e), "b" (x) ); 11.1279 - } 11.1280 - while ( unlikely(nd != d) || unlikely(y != x) ); 11.1281 - 11.1282 - /* 11.1283 - * Unlink from 'd'. We transferred at least one reference to 'e', so 11.1284 - * noone else is spinning to try to delete this page from 'd'. 11.1285 - */ 11.1286 - d->tot_pages--; 11.1287 - list_del(&page->list); 11.1288 - 11.1289 - /* 11.1290 - * Add the page to 'e'. Someone may already have removed the last 11.1291 - * reference and want to remove the page from 'e'. However, we have 11.1292 - * the lock so they'll spin waiting for us. 
11.1293 - */ 11.1294 - if ( unlikely(e->tot_pages++ == 0) ) 11.1295 - get_knownalive_domain(e); 11.1296 - list_add_tail(&page->list, &e->page_list); 11.1297 - 11.1298 - reassign_fail: 11.1299 - spin_unlock(&d->page_alloc_lock); 11.1300 - spin_unlock(&e->page_alloc_lock); 11.1301 - break; 11.1302 - 11.1303 - case MMUEXT_CLEAR_FOREIGNDOM: 11.1304 - if ( (e = percpu_info[cpu].foreign) != NULL ) 11.1305 - put_domain(e); 11.1306 - percpu_info[cpu].foreign = NULL; 11.1307 - break; 11.1308 - 11.1309 - default: 11.1310 - MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 11.1311 - okay = 0; 11.1312 - break; 11.1313 - } 11.1314 - 11.1315 - return okay; 11.1316 -} 11.1317 - 11.1318 -int do_mmu_update( 11.1319 - mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) 11.1320 -{ 11.1321 -/* 11.1322 - * We steal the m.s.b. of the @count parameter to indicate whether this 11.1323 - * invocation of do_mmu_update() is resuming a previously preempted call. 11.1324 - * We steal the next 15 bits to remember the current FOREIGNDOM. 11.1325 - */ 11.1326 -#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) 11.1327 -#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) 11.1328 -#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT) 11.1329 - 11.1330 - mmu_update_t req; 11.1331 - unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0; 11.1332 - struct pfn_info *page; 11.1333 - int rc = 0, okay = 1, i = 0, cpu = smp_processor_id(); 11.1334 - unsigned int cmd, done = 0; 11.1335 - unsigned long prev_smfn = 0; 11.1336 - l1_pgentry_t *prev_spl1e = 0; 11.1337 - struct exec_domain *ed = current; 11.1338 - struct domain *d = ed->domain; 11.1339 - u32 type_info; 11.1340 - domid_t domid; 11.1341 - 11.1342 - LOCK_BIGLOCK(d); 11.1343 - 11.1344 - cleanup_writable_pagetable(d); 11.1345 - 11.1346 - if ( unlikely(shadow_mode(d)) ) 11.1347 - check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */ 11.1348 - 11.1349 - /* 11.1350 - * If we are resuming after preemption, read how much work we have already 11.1351 - * done. This allows us to set the @done output parameter correctly. 11.1352 - * We also reset FOREIGNDOM here. 11.1353 - */ 11.1354 - if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) 11.1355 - { 11.1356 - if ( !(count & MMU_UPDATE_PREEMPTED) ) 11.1357 - { 11.1358 - /* Count overflow into private FOREIGNDOM field. 
*/ 11.1359 - MEM_LOG("do_mmu_update count is too large"); 11.1360 - rc = -EINVAL; 11.1361 - goto out; 11.1362 - } 11.1363 - count &= ~MMU_UPDATE_PREEMPTED; 11.1364 - domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; 11.1365 - count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; 11.1366 - if ( unlikely(pdone != NULL) ) 11.1367 - (void)get_user(done, pdone); 11.1368 - if ( (domid != current->domain->id) && 11.1369 - !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) 11.1370 - { 11.1371 - rc = -EINVAL; 11.1372 - goto out; 11.1373 - } 11.1374 - } 11.1375 - 11.1376 - perfc_incrc(calls_to_mmu_update); 11.1377 - perfc_addc(num_page_updates, count); 11.1378 - 11.1379 - if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) 11.1380 - { 11.1381 - rc = -EFAULT; 11.1382 - goto out; 11.1383 - } 11.1384 - 11.1385 - for ( i = 0; i < count; i++ ) 11.1386 - { 11.1387 - if ( hypercall_preempt_check() ) 11.1388 - { 11.1389 - rc = hypercall3_create_continuation( 11.1390 - __HYPERVISOR_mmu_update, ureqs, 11.1391 - (count - i) | 11.1392 - (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 11.1393 - MMU_UPDATE_PREEMPTED, pdone); 11.1394 - break; 11.1395 - } 11.1396 - 11.1397 - if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) 11.1398 - { 11.1399 - MEM_LOG("Bad __copy_from_user"); 11.1400 - rc = -EFAULT; 11.1401 - break; 11.1402 - } 11.1403 - 11.1404 - cmd = req.ptr & (sizeof(l1_pgentry_t)-1); 11.1405 - pfn = req.ptr >> PAGE_SHIFT; 11.1406 - 11.1407 - okay = 0; 11.1408 - 11.1409 - switch ( cmd ) 11.1410 - { 11.1411 - /* 11.1412 - * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 11.1413 - */ 11.1414 - case MMU_NORMAL_PT_UPDATE: 11.1415 - if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) 11.1416 - { 11.1417 - MEM_LOG("Could not get page for normal update"); 11.1418 - break; 11.1419 - } 11.1420 - 11.1421 - if ( likely(prev_pfn == pfn) ) 11.1422 - { 11.1423 - va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 11.1424 - } 11.1425 - else 11.1426 - { 11.1427 - if ( prev_pfn != 0 ) 11.1428 - unmap_domain_mem((void *)va); 11.1429 - va = (unsigned long)map_domain_mem(req.ptr); 11.1430 - prev_pfn = pfn; 11.1431 - } 11.1432 - 11.1433 - page = &frame_table[pfn]; 11.1434 - switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) 11.1435 - { 11.1436 - case PGT_l1_page_table: 11.1437 - if ( likely(get_page_type( 11.1438 - page, type_info & (PGT_type_mask|PGT_va_mask))) ) 11.1439 - { 11.1440 - okay = mod_l1_entry((l1_pgentry_t *)va, 11.1441 - mk_l1_pgentry(req.val)); 11.1442 - 11.1443 - if ( unlikely(shadow_mode(d)) && okay && 11.1444 - (get_shadow_status(d, page-frame_table) & 11.1445 - PSH_shadowed) ) 11.1446 - { 11.1447 - shadow_l1_normal_pt_update( 11.1448 - req.ptr, req.val, &prev_smfn, &prev_spl1e); 11.1449 - put_shadow_status(d); 11.1450 - } 11.1451 - 11.1452 - put_page_type(page); 11.1453 - } 11.1454 - break; 11.1455 - case PGT_l2_page_table: 11.1456 - if ( likely(get_page_type(page, PGT_l2_page_table)) ) 11.1457 - { 11.1458 - okay = mod_l2_entry((l2_pgentry_t *)va, 11.1459 - mk_l2_pgentry(req.val), 11.1460 - pfn); 11.1461 - 11.1462 - if ( unlikely(shadow_mode(d)) && okay && 11.1463 - (get_shadow_status(d, page-frame_table) & 11.1464 - PSH_shadowed) ) 11.1465 - { 11.1466 - shadow_l2_normal_pt_update(req.ptr, req.val); 11.1467 - put_shadow_status(d); 11.1468 - } 11.1469 - 11.1470 - put_page_type(page); 11.1471 - } 11.1472 - break; 11.1473 - default: 11.1474 - if ( likely(get_page_type(page, PGT_writable_page)) ) 11.1475 - { 11.1476 - *(unsigned long *)va = 
req.val; 11.1477 - okay = 1; 11.1478 - put_page_type(page); 11.1479 - } 11.1480 - break; 11.1481 - } 11.1482 - 11.1483 - put_page(page); 11.1484 - break; 11.1485 - 11.1486 - case MMU_MACHPHYS_UPDATE: 11.1487 - if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) 11.1488 - { 11.1489 - MEM_LOG("Could not get page for mach->phys update"); 11.1490 - break; 11.1491 - } 11.1492 - 11.1493 - machine_to_phys_mapping[pfn] = req.val; 11.1494 - okay = 1; 11.1495 - 11.1496 - /* 11.1497 - * If in log-dirty mode, mark the corresponding pseudo-physical 11.1498 - * page as dirty. 11.1499 - */ 11.1500 - if ( unlikely(shadow_mode(d) == SHM_logdirty) && 11.1501 - mark_dirty(d, pfn) ) 11.1502 - d->arch.shadow_dirty_block_count++; 11.1503 - 11.1504 - put_page(&frame_table[pfn]); 11.1505 - break; 11.1506 - 11.1507 - /* 11.1508 - * MMU_EXTENDED_COMMAND: Extended command is specified 11.1509 - * in the least-siginificant bits of the 'value' field. 11.1510 - */ 11.1511 - case MMU_EXTENDED_COMMAND: 11.1512 - req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 11.1513 - okay = do_extended_command(req.ptr, req.val); 11.1514 - break; 11.1515 - 11.1516 - default: 11.1517 - MEM_LOG("Invalid page update command %08lx", req.ptr); 11.1518 - break; 11.1519 - } 11.1520 - 11.1521 - if ( unlikely(!okay) ) 11.1522 - { 11.1523 - rc = -EINVAL; 11.1524 - break; 11.1525 - } 11.1526 - 11.1527 - ureqs++; 11.1528 - } 11.1529 - 11.1530 - out: 11.1531 - if ( prev_pfn != 0 ) 11.1532 - unmap_domain_mem((void *)va); 11.1533 - 11.1534 - if ( unlikely(prev_spl1e != 0) ) 11.1535 - unmap_domain_mem((void *)prev_spl1e); 11.1536 - 11.1537 - deferred_ops = percpu_info[cpu].deferred_ops; 11.1538 - percpu_info[cpu].deferred_ops = 0; 11.1539 - 11.1540 - if ( deferred_ops & DOP_FLUSH_TLB ) 11.1541 - local_flush_tlb(); 11.1542 - 11.1543 - if ( deferred_ops & DOP_RELOAD_LDT ) 11.1544 - (void)map_ldt_shadow_page(0); 11.1545 - 11.1546 - if ( unlikely(percpu_info[cpu].foreign != NULL) ) 11.1547 - { 11.1548 - put_domain(percpu_info[cpu].foreign); 11.1549 - percpu_info[cpu].foreign = NULL; 11.1550 - } 11.1551 - 11.1552 - /* Add incremental work we have done to the @done output parameter. */ 11.1553 - if ( unlikely(pdone != NULL) ) 11.1554 - __put_user(done + i, pdone); 11.1555 - 11.1556 - if ( unlikely(shadow_mode(d)) ) 11.1557 - check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */ 11.1558 - 11.1559 - UNLOCK_BIGLOCK(d); 11.1560 - return rc; 11.1561 -} 11.1562 - 11.1563 - 11.1564 -int do_update_va_mapping(unsigned long page_nr, 11.1565 - unsigned long val, 11.1566 - unsigned long flags) 11.1567 -{ 11.1568 - struct exec_domain *ed = current; 11.1569 - struct domain *d = ed->domain; 11.1570 - int err = 0; 11.1571 - unsigned int cpu = ed->processor; 11.1572 - unsigned long deferred_ops; 11.1573 - 11.1574 - perfc_incrc(calls_to_update_va); 11.1575 - 11.1576 - if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) 11.1577 - return -EINVAL; 11.1578 - 11.1579 - LOCK_BIGLOCK(d); 11.1580 - 11.1581 - cleanup_writable_pagetable(d); 11.1582 - 11.1583 - /* 11.1584 - * XXX When we make this support 4MB superpages we should also deal with 11.1585 - * the case of updating L2 entries. 
11.1586 - */ 11.1587 - 11.1588 - if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 11.1589 - mk_l1_pgentry(val))) ) 11.1590 - err = -EINVAL; 11.1591 - 11.1592 - if ( unlikely(shadow_mode(d)) ) 11.1593 - { 11.1594 - unsigned long sval = 0; 11.1595 - 11.1596 - l1pte_propagate_from_guest(d, &val, &sval); 11.1597 - 11.1598 - if ( unlikely(__put_user(sval, ((unsigned long *)( 11.1599 - &shadow_linear_pg_table[page_nr])))) ) 11.1600 - { 11.1601 - /* 11.1602 - * Since L2's are guranteed RW, failure indicates the page was not 11.1603 - * shadowed, so ignore. 11.1604 - */ 11.1605 - perfc_incrc(shadow_update_va_fail); 11.1606 - } 11.1607 - 11.1608 - /* 11.1609 - * If we're in log-dirty mode then we need to note that we've updated 11.1610 - * the PTE in the PT-holding page. We need the machine frame number 11.1611 - * for this. 11.1612 - */ 11.1613 - if ( shadow_mode(d) == SHM_logdirty ) 11.1614 - mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT)); 11.1615 - 11.1616 - check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ 11.1617 - } 11.1618 - 11.1619 - deferred_ops = percpu_info[cpu].deferred_ops; 11.1620 - percpu_info[cpu].deferred_ops = 0; 11.1621 - 11.1622 - if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 11.1623 - unlikely(flags & UVMF_FLUSH_TLB) ) 11.1624 - local_flush_tlb(); 11.1625 - else if ( unlikely(flags & UVMF_INVLPG) ) 11.1626 - __flush_tlb_one(page_nr << PAGE_SHIFT); 11.1627 - 11.1628 - if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) 11.1629 - (void)map_ldt_shadow_page(0); 11.1630 - 11.1631 - UNLOCK_BIGLOCK(d); 11.1632 - 11.1633 - return err; 11.1634 -} 11.1635 - 11.1636 -int do_update_va_mapping_otherdomain(unsigned long page_nr, 11.1637 - unsigned long val, 11.1638 - unsigned long flags, 11.1639 - domid_t domid) 11.1640 -{ 11.1641 - unsigned int cpu = smp_processor_id(); 11.1642 - struct domain *d; 11.1643 - int rc; 11.1644 - 11.1645 - if ( unlikely(!IS_PRIV(current->domain)) ) 11.1646 - return -EPERM; 11.1647 - 11.1648 - percpu_info[cpu].foreign = d = find_domain_by_id(domid); 11.1649 - if ( unlikely(d == NULL) ) 11.1650 - { 11.1651 - MEM_LOG("Unknown domain '%u'", domid); 11.1652 - return -ESRCH; 11.1653 - } 11.1654 - 11.1655 - rc = do_update_va_mapping(page_nr, val, flags); 11.1656 - 11.1657 - put_domain(d); 11.1658 - percpu_info[cpu].foreign = NULL; 11.1659 - 11.1660 - return rc; 11.1661 -} 11.1662 - 11.1663 - 11.1664 - 11.1665 -/************************* 11.1666 - * Descriptor Tables 11.1667 - */ 11.1668 - 11.1669 -void destroy_gdt(struct exec_domain *ed) 11.1670 -{ 11.1671 - int i; 11.1672 - unsigned long pfn; 11.1673 - 11.1674 - for ( i = 0; i < 16; i++ ) 11.1675 - { 11.1676 - if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) 11.1677 - put_page_and_type(&frame_table[pfn]); 11.1678 - ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 11.1679 - } 11.1680 -} 11.1681 - 11.1682 - 11.1683 -long set_gdt(struct exec_domain *ed, 11.1684 - unsigned long *frames, 11.1685 - unsigned int entries) 11.1686 -{ 11.1687 - struct domain *d = ed->domain; 11.1688 - /* NB. There are 512 8-byte entries per GDT page. */ 11.1689 - int i = 0, nr_pages = (entries + 511) / 512; 11.1690 - struct desc_struct *vgdt; 11.1691 - unsigned long pfn; 11.1692 - 11.1693 - /* Check the first page in the new GDT. */ 11.1694 - if ( (pfn = frames[0]) >= max_page ) 11.1695 - goto fail; 11.1696 - 11.1697 - /* The first page is special because Xen owns a range of entries in it. 
*/ 11.1698 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 11.1699 - { 11.1700 - /* GDT checks failed: try zapping the Xen reserved entries. */ 11.1701 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) 11.1702 - goto fail; 11.1703 - vgdt = map_domain_mem(pfn << PAGE_SHIFT); 11.1704 - memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, 11.1705 - NR_RESERVED_GDT_ENTRIES*8); 11.1706 - unmap_domain_mem(vgdt); 11.1707 - put_page_and_type(&frame_table[pfn]); 11.1708 - 11.1709 - /* Okay, we zapped the entries. Now try the GDT checks again. */ 11.1710 - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 11.1711 - goto fail; 11.1712 - } 11.1713 - 11.1714 - /* Check the remaining pages in the new GDT. */ 11.1715 - for ( i = 1; i < nr_pages; i++ ) 11.1716 - if ( ((pfn = frames[i]) >= max_page) || 11.1717 - !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 11.1718 - goto fail; 11.1719 - 11.1720 - /* Copy reserved GDT entries to the new GDT. */ 11.1721 - vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 11.1722 - memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 11.1723 - gdt_table + FIRST_RESERVED_GDT_ENTRY, 11.1724 - NR_RESERVED_GDT_ENTRIES*8); 11.1725 - unmap_domain_mem(vgdt); 11.1726 - 11.1727 - /* Tear down the old GDT. */ 11.1728 - destroy_gdt(ed); 11.1729 - 11.1730 - /* Install the new GDT. */ 11.1731 - for ( i = 0; i < nr_pages; i++ ) 11.1732 - ed->arch.perdomain_ptes[i] = 11.1733 - mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 11.1734 - 11.1735 - SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); 11.1736 - SET_GDT_ENTRIES(ed, entries); 11.1737 - 11.1738 - return 0; 11.1739 - 11.1740 - fail: 11.1741 - while ( i-- > 0 ) 11.1742 - put_page_and_type(&frame_table[frames[i]]); 11.1743 - return -EINVAL; 11.1744 -} 11.1745 - 11.1746 - 11.1747 -long do_set_gdt(unsigned long *frame_list, unsigned int entries) 11.1748 -{ 11.1749 - int nr_pages = (entries + 511) / 512; 11.1750 - unsigned long frames[16]; 11.1751 - long ret; 11.1752 - 11.1753 - if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 11.1754 - return -EINVAL; 11.1755 - 11.1756 - if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) 11.1757 - return -EFAULT; 11.1758 - 11.1759 - LOCK_BIGLOCK(current->domain); 11.1760 - 11.1761 - if ( (ret = set_gdt(current, frames, entries)) == 0 ) 11.1762 - { 11.1763 - local_flush_tlb(); 11.1764 - __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); 11.1765 - } 11.1766 - 11.1767 - UNLOCK_BIGLOCK(current->domain); 11.1768 - 11.1769 - return ret; 11.1770 -} 11.1771 - 11.1772 - 11.1773 -long do_update_descriptor( 11.1774 - unsigned long pa, unsigned long word1, unsigned long word2) 11.1775 -{ 11.1776 - unsigned long pfn = pa >> PAGE_SHIFT; 11.1777 - struct desc_struct *gdt_pent, d; 11.1778 - struct pfn_info *page; 11.1779 - struct exec_domain *ed; 11.1780 - long ret = -EINVAL; 11.1781 - 11.1782 - d.a = (u32)word1; 11.1783 - d.b = (u32)word2; 11.1784 - 11.1785 - LOCK_BIGLOCK(current->domain); 11.1786 - 11.1787 - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { 11.1788 - UNLOCK_BIGLOCK(current->domain); 11.1789 - return -EINVAL; 11.1790 - } 11.1791 - 11.1792 - page = &frame_table[pfn]; 11.1793 - if ( unlikely(!get_page(page, current->domain)) ) { 11.1794 - UNLOCK_BIGLOCK(current->domain); 11.1795 - return -EINVAL; 11.1796 - } 11.1797 - 11.1798 - /* Check if the given frame is in use in an unsafe context. 
*/ 11.1799 - switch ( page->u.inuse.type_info & PGT_type_mask ) 11.1800 - { 11.1801 - case PGT_gdt_page: 11.1802 - /* Disallow updates of Xen-reserved descriptors in the current GDT. */ 11.1803 - for_each_exec_domain(current->domain, ed) { 11.1804 - if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) && 11.1805 - (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 11.1806 - (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 11.1807 - goto out; 11.1808 - } 11.1809 - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 11.1810 - goto out; 11.1811 - break; 11.1812 - case PGT_ldt_page: 11.1813 - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 11.1814 - goto out; 11.1815 - break; 11.1816 - default: 11.1817 - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) 11.1818 - goto out; 11.1819 - break; 11.1820 - } 11.1821 - 11.1822 - /* All is good so make the update. */ 11.1823 - gdt_pent = map_domain_mem(pa); 11.1824 - memcpy(gdt_pent, &d, 8); 11.1825 - unmap_domain_mem(gdt_pent); 11.1826 - 11.1827 - put_page_type(page); 11.1828 - 11.1829 - ret = 0; /* success */ 11.1830 - 11.1831 - out: 11.1832 - put_page(page); 11.1833 - 11.1834 - UNLOCK_BIGLOCK(current->domain); 11.1835 - 11.1836 - return ret; 11.1837 -} 11.1838 - 11.1839 - 11.1840 - 11.1841 -/************************* 11.1842 - * Writable Pagetables 11.1843 - */ 11.1844 - 11.1845 -ptwr_info_t ptwr_info[NR_CPUS]; 11.1846 - 11.1847 -#ifdef VERBOSE 11.1848 -int ptwr_debug = 0x0; 11.1849 -#define PTWR_PRINTK(_f, _a...) \ 11.1850 - do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) 11.1851 -#define PTWR_PRINT_WHICH (which ? 'I' : 'A') 11.1852 -#else 11.1853 -#define PTWR_PRINTK(_f, _a...) ((void)0) 11.1854 -#endif 11.1855 - 11.1856 -/* Flush the given writable p.t. page and write-protect it again. */ 11.1857 -void ptwr_flush(const int which) 11.1858 -{ 11.1859 - unsigned long sstat, spte, pte, *ptep, l1va; 11.1860 - l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; 11.1861 - l2_pgentry_t *pl2e; 11.1862 - int i, cpu = smp_processor_id(); 11.1863 - struct exec_domain *ed = current; 11.1864 - struct domain *d = ed->domain; 11.1865 - 11.1866 - l1va = ptwr_info[cpu].ptinfo[which].l1va; 11.1867 - ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; 11.1868 - 11.1869 - /* 11.1870 - * STEP 1. Write-protect the p.t. page so no more updates can occur. 11.1871 - */ 11.1872 - 11.1873 - if ( unlikely(__get_user(pte, ptep)) ) 11.1874 - { 11.1875 - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 11.1876 - /* 11.1877 - * Really a bug. We could read this PTE during the initial fault, 11.1878 - * and pagetables can't have changed meantime. XXX Multi-CPU guests? 11.1879 - */ 11.1880 - BUG(); 11.1881 - } 11.1882 - PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", 11.1883 - PTWR_PRINT_WHICH, ptep, pte); 11.1884 - pte &= ~_PAGE_RW; 11.1885 - 11.1886 - if ( unlikely(shadow_mode(d)) ) 11.1887 - { 11.1888 - /* Write-protect the p.t. page in the shadow page table. */ 11.1889 - l1pte_propagate_from_guest(d, &pte, &spte); 11.1890 - __put_user( 11.1891 - spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); 11.1892 - 11.1893 - /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ 11.1894 - sstat = get_shadow_status(d, pte >> PAGE_SHIFT); 11.1895 - if ( sstat & PSH_shadowed ) 11.1896 - sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); 11.1897 - } 11.1898 - 11.1899 - /* Write-protect the p.t. page in the guest page table. 
*/ 11.1900 - if ( unlikely(__put_user(pte, ptep)) ) 11.1901 - { 11.1902 - MEM_LOG("ptwr: Could not update pte at %p\n", ptep); 11.1903 - /* 11.1904 - * Really a bug. We could write this PTE during the initial fault, 11.1905 - * and pagetables can't have changed meantime. XXX Multi-CPU guests? 11.1906 - */ 11.1907 - BUG(); 11.1908 - } 11.1909 - 11.1910 - /* Ensure that there are no stale writable mappings in any TLB. */ 11.1911 - /* NB. INVLPG is a serialising instruction: flushes pending updates. */ 11.1912 -#if 1 11.1913 - __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ 11.1914 -#else 11.1915 - flush_tlb_all(); 11.1916 -#endif 11.1917 - PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", 11.1918 - PTWR_PRINT_WHICH, ptep, pte); 11.1919 - 11.1920 - /* 11.1921 - * STEP 2. Validate any modified PTEs. 11.1922 - */ 11.1923 - 11.1924 - pl1e = ptwr_info[cpu].ptinfo[which].pl1e; 11.1925 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.1926 - { 11.1927 - ol1e = ptwr_info[cpu].ptinfo[which].page[i]; 11.1928 - nl1e = pl1e[i]; 11.1929 - 11.1930 - if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) 11.1931 - continue; 11.1932 - 11.1933 - /* 11.1934 - * Fast path for PTEs that have merely been write-protected 11.1935 - * (e.g., during a Unix fork()). A strict reduction in privilege. 11.1936 - */ 11.1937 - if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) 11.1938 - { 11.1939 - if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) 11.1940 - { 11.1941 - if ( unlikely(sl1e != NULL) ) 11.1942 - l1pte_propagate_from_guest( 11.1943 - d, &l1_pgentry_val(nl1e), 11.1944 - &l1_pgentry_val(sl1e[i])); 11.1945 - put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); 11.1946 - } 11.1947 - continue; 11.1948 - } 11.1949 - 11.1950 - if ( unlikely(!get_page_from_l1e(nl1e, d)) ) 11.1951 - { 11.1952 - MEM_LOG("ptwr: Could not re-validate l1 page\n"); 11.1953 - /* 11.1954 - * Make the remaining p.t's consistent before crashing, so the 11.1955 - * reference counts are correct. 11.1956 - */ 11.1957 - memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], 11.1958 - (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); 11.1959 - unmap_domain_mem(pl1e); 11.1960 - ptwr_info[cpu].ptinfo[which].l1va = 0; 11.1961 - UNLOCK_BIGLOCK(d); 11.1962 - domain_crash(); 11.1963 - } 11.1964 - 11.1965 - if ( unlikely(sl1e != NULL) ) 11.1966 - l1pte_propagate_from_guest( 11.1967 - d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); 11.1968 - 11.1969 - if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) 11.1970 - put_page_from_l1e(ol1e, d); 11.1971 - } 11.1972 - unmap_domain_mem(pl1e); 11.1973 - 11.1974 - /* 11.1975 - * STEP 3. Reattach the L1 p.t. page into the current address space. 11.1976 - */ 11.1977 - 11.1978 - if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) ) 11.1979 - { 11.1980 - pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; 11.1981 - *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 11.1982 - } 11.1983 - 11.1984 - /* 11.1985 - * STEP 4. Final tidy-up. 11.1986 - */ 11.1987 - 11.1988 - ptwr_info[cpu].ptinfo[which].l1va = 0; 11.1989 - 11.1990 - if ( unlikely(sl1e != NULL) ) 11.1991 - { 11.1992 - unmap_domain_mem(sl1e); 11.1993 - put_shadow_status(d); 11.1994 - } 11.1995 -} 11.1996 - 11.1997 -/* Write page fault handler: check if guest is trying to modify a PTE. 
*/ 11.1998 -int ptwr_do_page_fault(unsigned long addr) 11.1999 -{ 11.2000 - unsigned long pte, pfn, l2e; 11.2001 - struct pfn_info *page; 11.2002 - l2_pgentry_t *pl2e; 11.2003 - int which, cpu = smp_processor_id(); 11.2004 - u32 l2_idx; 11.2005 - 11.2006 - /* 11.2007 - * Attempt to read the PTE that maps the VA being accessed. By checking for 11.2008 - * PDE validity in the L2 we avoid many expensive fixups in __get_user(). 11.2009 - */ 11.2010 - if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & 11.2011 - _PAGE_PRESENT) || 11.2012 - __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) 11.2013 - { 11.2014 - return 0; 11.2015 - } 11.2016 - 11.2017 - pfn = pte >> PAGE_SHIFT; 11.2018 - page = &frame_table[pfn]; 11.2019 - 11.2020 - /* We are looking only for read-only mappings of p.t. pages. */ 11.2021 - if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || 11.2022 - ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) 11.2023 - { 11.2024 - return 0; 11.2025 - } 11.2026 - 11.2027 - /* Get the L2 index at which this L1 p.t. is always mapped. */ 11.2028 - l2_idx = page->u.inuse.type_info & PGT_va_mask; 11.2029 - if ( unlikely(l2_idx >= PGT_va_unknown) ) 11.2030 - { 11.2031 - domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ 11.2032 - } 11.2033 - l2_idx >>= PGT_va_shift; 11.2034 - 11.2035 - if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) 11.2036 - { 11.2037 - MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); 11.2038 - domain_crash(); 11.2039 - } 11.2040 - 11.2041 - /* 11.2042 - * Is the L1 p.t. mapped into the current address space? If so we call it 11.2043 - * an ACTIVE p.t., otherwise it is INACTIVE. 11.2044 - */ 11.2045 - pl2e = &linear_l2_table[l2_idx]; 11.2046 - l2e = l2_pgentry_val(*pl2e); 11.2047 - which = PTWR_PT_INACTIVE; 11.2048 - if ( (l2e >> PAGE_SHIFT) == pfn ) 11.2049 - { 11.2050 - /* Check the PRESENT bit to set ACTIVE. */ 11.2051 - if ( likely(l2e & _PAGE_PRESENT) ) 11.2052 - which = PTWR_PT_ACTIVE; 11.2053 - else { 11.2054 - /* 11.2055 - * If the PRESENT bit is clear, we may be conflicting with 11.2056 - * the current ACTIVE p.t. (it may be the same p.t. mapped 11.2057 - * at another virt addr). 11.2058 - * The ptwr_flush call below will restore the PRESENT bit. 11.2059 - */ 11.2060 - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && 11.2061 - l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) 11.2062 - which = PTWR_PT_ACTIVE; 11.2063 - } 11.2064 - } 11.2065 - 11.2066 - PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " 11.2067 - "pfn %08lx\n", PTWR_PRINT_WHICH, 11.2068 - addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); 11.2069 - 11.2070 - /* 11.2071 - * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at 11.2072 - * time. If there is already one, we must flush it out. 11.2073 - */ 11.2074 - if ( ptwr_info[cpu].ptinfo[which].l1va ) 11.2075 - ptwr_flush(which); 11.2076 - 11.2077 - ptwr_info[cpu].ptinfo[which].l1va = addr | 1; 11.2078 - ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; 11.2079 - 11.2080 - /* For safety, disconnect the L1 p.t. page from current space. */ 11.2081 - if ( (which == PTWR_PT_ACTIVE) && 11.2082 - likely(!shadow_mode(current->domain)) ) 11.2083 - { 11.2084 - *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); 11.2085 -#if 1 11.2086 - flush_tlb(); /* XXX Multi-CPU guests? */ 11.2087 -#else 11.2088 - flush_tlb_all(); 11.2089 -#endif 11.2090 - } 11.2091 - 11.2092 - /* Temporarily map the L1 page, and make a copy of it. 
*/ 11.2093 - ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); 11.2094 - memcpy(ptwr_info[cpu].ptinfo[which].page, 11.2095 - ptwr_info[cpu].ptinfo[which].pl1e, 11.2096 - ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); 11.2097 - 11.2098 - /* Finally, make the p.t. page writable by the guest OS. */ 11.2099 - pte |= _PAGE_RW; 11.2100 - PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, 11.2101 - &linear_pg_table[addr>>PAGE_SHIFT], pte); 11.2102 - if ( unlikely(__put_user(pte, (unsigned long *) 11.2103 - &linear_pg_table[addr>>PAGE_SHIFT])) ) 11.2104 - { 11.2105 - MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) 11.2106 - &linear_pg_table[addr>>PAGE_SHIFT]); 11.2107 - /* Toss the writable pagetable state and crash. */ 11.2108 - unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); 11.2109 - ptwr_info[cpu].ptinfo[which].l1va = 0; 11.2110 - domain_crash(); 11.2111 - } 11.2112 - 11.2113 - return EXCRET_fault_fixed; 11.2114 -} 11.2115 - 11.2116 -static __init int ptwr_init(void) 11.2117 -{ 11.2118 - int i; 11.2119 - 11.2120 - for ( i = 0; i < smp_num_cpus; i++ ) 11.2121 - { 11.2122 - ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = 11.2123 - (void *)alloc_xenheap_page(); 11.2124 - ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = 11.2125 - (void *)alloc_xenheap_page(); 11.2126 - } 11.2127 - 11.2128 - return 0; 11.2129 -} 11.2130 -__initcall(ptwr_init); 11.2131 - 11.2132 - 11.2133 - 11.2134 - 11.2135 -/************************************************************************/ 11.2136 -/************************************************************************/ 11.2137 -/************************************************************************/ 11.2138 - 11.2139 -#ifndef NDEBUG 11.2140 - 11.2141 -void ptwr_status(void) 11.2142 -{ 11.2143 - unsigned long pte, *ptep, pfn; 11.2144 - struct pfn_info *page; 11.2145 - int cpu = smp_processor_id(); 11.2146 - 11.2147 - ptep = (unsigned long *)&linear_pg_table 11.2148 - [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; 11.2149 - 11.2150 - if ( __get_user(pte, ptep) ) { 11.2151 - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 11.2152 - domain_crash(); 11.2153 - } 11.2154 - 11.2155 - pfn = pte >> PAGE_SHIFT; 11.2156 - page = &frame_table[pfn]; 11.2157 - printk("need to alloc l1 page %p\n", page); 11.2158 - /* make pt page writable */ 11.2159 - printk("need to make read-only l1-page at %p is %08lx\n", 11.2160 - ptep, pte); 11.2161 - 11.2162 - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) 11.2163 - return; 11.2164 - 11.2165 - if ( __get_user(pte, (unsigned long *) 11.2166 - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { 11.2167 - MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) 11.2168 - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); 11.2169 - domain_crash(); 11.2170 - } 11.2171 - pfn = pte >> PAGE_SHIFT; 11.2172 - page = &frame_table[pfn]; 11.2173 -} 11.2174 - 11.2175 -void audit_domain(struct domain *d) 11.2176 -{ 11.2177 - int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; 11.2178 - 11.2179 - void adjust (struct pfn_info *page, int dir, int adjtype) 11.2180 - { 11.2181 - int count = page->count_info & PGC_count_mask; 11.2182 - 11.2183 - if ( adjtype ) 11.2184 - { 11.2185 - int tcount = page->u.inuse.type_info & PGT_count_mask; 11.2186 - 11.2187 - ttot++; 11.2188 - 11.2189 - tcount += dir; 11.2190 - 11.2191 - if ( tcount < 0 ) 11.2192 - { 11.2193 - /* This will only come out once. 
*/ 11.2194 - printk("Audit %d: type count whent below zero pfn=%x " 11.2195 - "taf=%x otaf=%x\n", 11.2196 - d->id, page-frame_table, 11.2197 - page->u.inuse.type_info, 11.2198 - page->tlbflush_timestamp); 11.2199 - } 11.2200 - 11.2201 - page->u.inuse.type_info = 11.2202 - (page->u.inuse.type_info & ~PGT_count_mask) | 11.2203 - (tcount & PGT_count_mask); 11.2204 - } 11.2205 - 11.2206 - ctot++; 11.2207 - count += dir; 11.2208 - if ( count < 0 ) 11.2209 - { 11.2210 - /* This will only come out once. */ 11.2211 - printk("Audit %d: general count whent below zero pfn=%x " 11.2212 - "taf=%x otaf=%x\n", 11.2213 - d->id, page-frame_table, 11.2214 - page->u.inuse.type_info, 11.2215 - page->tlbflush_timestamp); 11.2216 - } 11.2217 - 11.2218 - page->count_info = 11.2219 - (page->count_info & ~PGC_count_mask) | 11.2220 - (count & PGC_count_mask); 11.2221 - 11.2222 - } 11.2223 - 11.2224 - void scan_for_pfn(struct domain *d, unsigned long xpfn) 11.2225 - { 11.2226 - unsigned long pfn, *pt; 11.2227 - struct list_head *list_ent; 11.2228 - struct pfn_info *page; 11.2229 - int i; 11.2230 - 11.2231 - list_ent = d->page_list.next; 11.2232 - for ( i = 0; (list_ent != &d->page_list); i++ ) 11.2233 - { 11.2234 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 11.2235 - page = &frame_table[pfn]; 11.2236 - 11.2237 - switch ( page->u.inuse.type_info & PGT_type_mask ) 11.2238 - { 11.2239 - case PGT_l1_page_table: 11.2240 - case PGT_l2_page_table: 11.2241 - pt = map_domain_mem(pfn<<PAGE_SHIFT); 11.2242 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.2243 - if ( (pt[i] & _PAGE_PRESENT) && 11.2244 - ((pt[i] >> PAGE_SHIFT) == xpfn) ) 11.2245 - printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", 11.2246 - d->id, i, pfn, page->u.inuse.type_info, 11.2247 - page->count_info); 11.2248 - unmap_domain_mem(pt); 11.2249 - } 11.2250 - 11.2251 - list_ent = frame_table[pfn].list.next; 11.2252 - } 11.2253 - 11.2254 - } 11.2255 - 11.2256 - void scan_for_pfn_remote(unsigned long xpfn) 11.2257 - { 11.2258 - struct domain *e; 11.2259 - for_each_domain ( e ) 11.2260 - scan_for_pfn( e, xpfn ); 11.2261 - } 11.2262 - 11.2263 - int i; 11.2264 - unsigned long pfn; 11.2265 - struct list_head *list_ent; 11.2266 - struct pfn_info *page; 11.2267 - 11.2268 - if ( d != current->domain ) 11.2269 - domain_pause(d); 11.2270 - synchronise_pagetables(~0UL); 11.2271 - 11.2272 - printk("pt base=%lx sh_info=%x\n", 11.2273 - pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT, 11.2274 - virt_to_page(d->shared_info)-frame_table); 11.2275 - 11.2276 - spin_lock(&d->page_alloc_lock); 11.2277 - 11.2278 - /* PHASE 0 */ 11.2279 - 11.2280 - list_ent = d->page_list.next; 11.2281 - for ( i = 0; (list_ent != &d->page_list); i++ ) 11.2282 - { 11.2283 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 11.2284 - page = &frame_table[pfn]; 11.2285 - 11.2286 - if ( page_get_owner(page) != d ) 11.2287 - BUG(); 11.2288 - 11.2289 - if ( (page->u.inuse.type_info & PGT_count_mask) > 11.2290 - (page->count_info & PGC_count_mask) ) 11.2291 - printk("taf > caf %x %x pfn=%lx\n", 11.2292 - page->u.inuse.type_info, page->count_info, pfn ); 11.2293 - 11.2294 -#if 0 /* SYSV shared memory pages plus writeable files. 
*/ 11.2295 - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 11.2296 - (page->u.inuse.type_info & PGT_count_mask) > 1 ) 11.2297 - { 11.2298 - printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", 11.2299 - pfn, 11.2300 - page->u.inuse.type_info, 11.2301 - page->count_info ); 11.2302 - scan_for_pfn_remote(pfn); 11.2303 - } 11.2304 -#endif 11.2305 - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 11.2306 - (page->u.inuse.type_info & PGT_count_mask) > 1 ) 11.2307 - { 11.2308 - printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", 11.2309 - pfn, 11.2310 - page->u.inuse.type_info, 11.2311 - page->count_info ); 11.2312 - } 11.2313 - 11.2314 - /* Use tlbflush_timestamp to store original type_info. */ 11.2315 - page->tlbflush_timestamp = page->u.inuse.type_info; 11.2316 - 11.2317 - list_ent = frame_table[pfn].list.next; 11.2318 - } 11.2319 - 11.2320 - 11.2321 - /* PHASE 1 */ 11.2322 - 11.2323 - adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1); 11.2324 - 11.2325 - list_ent = d->page_list.next; 11.2326 - for ( i = 0; (list_ent != &d->page_list); i++ ) 11.2327 - { 11.2328 - unsigned long *pt; 11.2329 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 11.2330 - page = &frame_table[pfn]; 11.2331 - 11.2332 - if ( page_get_owner(page) != d ) 11.2333 - BUG(); 11.2334 - 11.2335 - switch ( page->u.inuse.type_info & PGT_type_mask ) 11.2336 - { 11.2337 - case PGT_l2_page_table: 11.2338 - 11.2339 - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 11.2340 - printk("Audit %d: L2 not validated %x\n", 11.2341 - d->id, page->u.inuse.type_info); 11.2342 - 11.2343 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 11.2344 - printk("Audit %d: L2 not pinned %x\n", 11.2345 - d->id, page->u.inuse.type_info); 11.2346 - else 11.2347 - adjust( page, -1, 1 ); 11.2348 - 11.2349 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 11.2350 - 11.2351 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 11.2352 - { 11.2353 - if ( pt[i] & _PAGE_PRESENT ) 11.2354 - { 11.2355 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 11.2356 - struct pfn_info *l1page = &frame_table[l1pfn]; 11.2357 - 11.2358 - if ( page_get_owner(l1page) != d ) 11.2359 - { 11.2360 - printk("L2: Skip bizarre page belonging to other " 11.2361 - "dom %p\n", page_get_owner(l1page)); 11.2362 - continue; 11.2363 - } 11.2364 - 11.2365 - if ( (l1page->u.inuse.type_info & PGT_type_mask) == 11.2366 - PGT_l2_page_table ) 11.2367 - printk("Audit %d: [%x] Found %s Linear PT " 11.2368 - "t=%x pfn=%lx\n", d->id, i, 11.2369 - (l1pfn==pfn) ? 
"Self" : "Other", 11.2370 - l1page->u.inuse.type_info, 11.2371 - l1pfn); 11.2372 - else if ( (l1page->u.inuse.type_info & PGT_type_mask) != 11.2373 - PGT_l1_page_table ) 11.2374 - printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", 11.2375 - d->id, i, 11.2376 - l1page->u.inuse.type_info, 11.2377 - l1pfn); 11.2378 - 11.2379 - adjust(l1page, -1, 1); 11.2380 - } 11.2381 - } 11.2382 - 11.2383 - unmap_domain_mem(pt); 11.2384 - 11.2385 - break; 11.2386 - 11.2387 - 11.2388 - case PGT_l1_page_table: 11.2389 - 11.2390 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 11.2391 - adjust( page, -1, 1 ); 11.2392 - 11.2393 - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 11.2394 - printk("Audit %d: L1 not validated %x\n", 11.2395 - d->id, page->u.inuse.type_info); 11.2396 -#if 0 11.2397 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 11.2398 - printk("Audit %d: L1 not pinned %x\n", 11.2399 - d->id, page->u.inuse.type_info); 11.2400 -#endif 11.2401 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 11.2402 - 11.2403 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.2404 - { 11.2405 - if ( pt[i] & _PAGE_PRESENT ) 11.2406 - { 11.2407 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 11.2408 - struct pfn_info *l1page = &frame_table[l1pfn]; 11.2409 - 11.2410 - if ( l1pfn < 0x100 ) 11.2411 - { 11.2412 - lowmem_mappings++; 11.2413 - continue; 11.2414 - } 11.2415 - 11.2416 - if ( l1pfn > max_page ) 11.2417 - { 11.2418 - io_mappings++; 11.2419 - continue; 11.2420 - } 11.2421 - 11.2422 - if ( pt[i] & _PAGE_RW ) 11.2423 - { 11.2424 - 11.2425 - if ( (l1page->u.inuse.type_info & PGT_type_mask) == 11.2426 - PGT_l1_page_table || 11.2427 - (l1page->u.inuse.type_info & PGT_type_mask) == 11.2428 - PGT_l2_page_table ) 11.2429 - printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", 11.2430 - d->id, i, 11.2431 - l1page->u.inuse.type_info, 11.2432 - l1pfn); 11.2433 - 11.2434 - } 11.2435 - 11.2436 - if ( page_get_owner(l1page) != d ) 11.2437 - { 11.2438 - printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " 11.2439 - "pfn=%lx c=%08x t=%08x m2p=%lx\n", 11.2440 - d->id, pfn, i, 11.2441 - page_get_owner(l1page), 11.2442 - l1pfn, 11.2443 - l1page->count_info, 11.2444 - l1page->u.inuse.type_info, 11.2445 - machine_to_phys_mapping[l1pfn]); 11.2446 - continue; 11.2447 - } 11.2448 - 11.2449 - adjust(l1page, -1, 0); 11.2450 - } 11.2451 - } 11.2452 - 11.2453 - unmap_domain_mem(pt); 11.2454 - 11.2455 - break; 11.2456 - } 11.2457 - 11.2458 - list_ent = frame_table[pfn].list.next; 11.2459 - } 11.2460 - 11.2461 - if ( (io_mappings > 0) || (lowmem_mappings > 0) ) 11.2462 - printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", 11.2463 - d->id, lowmem_mappings, io_mappings); 11.2464 - 11.2465 - /* PHASE 2 */ 11.2466 - 11.2467 - ctot = ttot = 0; 11.2468 - list_ent = d->page_list.next; 11.2469 - for ( i = 0; (list_ent != &d->page_list); i++ ) 11.2470 - { 11.2471 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 11.2472 - page = &frame_table[pfn]; 11.2473 - 11.2474 - switch ( page->u.inuse.type_info & PGT_type_mask) 11.2475 - { 11.2476 - case PGT_l1_page_table: 11.2477 - case PGT_l2_page_table: 11.2478 - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) 11.2479 - { 11.2480 - printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n", 11.2481 - d->id, page->u.inuse.type_info, 11.2482 - page->tlbflush_timestamp, 11.2483 - page->count_info, pfn ); 11.2484 - scan_for_pfn_remote(pfn); 11.2485 - } 11.2486 - default: 11.2487 - if ( (page->count_info & PGC_count_mask) != 1 ) 11.2488 - { 
11.2489 - printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", 11.2490 - d->id, 11.2491 - page->count_info, 11.2492 - page->u.inuse.type_info, 11.2493 - page->tlbflush_timestamp, pfn ); 11.2494 - scan_for_pfn_remote(pfn); 11.2495 - } 11.2496 - break; 11.2497 - } 11.2498 - 11.2499 - list_ent = frame_table[pfn].list.next; 11.2500 - } 11.2501 - 11.2502 - /* PHASE 3 */ 11.2503 - list_ent = d->page_list.next; 11.2504 - for ( i = 0; (list_ent != &d->page_list); i++ ) 11.2505 - { 11.2506 - unsigned long *pt; 11.2507 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 11.2508 - page = &frame_table[pfn]; 11.2509 - 11.2510 - switch ( page->u.inuse.type_info & PGT_type_mask ) 11.2511 - { 11.2512 - case PGT_l2_page_table: 11.2513 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 11.2514 - adjust( page, 1, 1 ); 11.2515 - 11.2516 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 11.2517 - 11.2518 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 11.2519 - { 11.2520 - if ( pt[i] & _PAGE_PRESENT ) 11.2521 - { 11.2522 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 11.2523 - struct pfn_info *l1page; 11.2524 - 11.2525 - if (l1pfn>max_page) 11.2526 - continue; 11.2527 - 11.2528 - l1page = &frame_table[l1pfn]; 11.2529 - 11.2530 - if ( page_get_owner(l1page) == d ) 11.2531 - adjust(l1page, 1, 1); 11.2532 - } 11.2533 - } 11.2534 - 11.2535 - unmap_domain_mem(pt); 11.2536 - break; 11.2537 - 11.2538 - case PGT_l1_page_table: 11.2539 - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 11.2540 - adjust( page, 1, 1 ); 11.2541 - 11.2542 - pt = map_domain_mem( pfn<<PAGE_SHIFT ); 11.2543 - 11.2544 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 11.2545 - { 11.2546 - if ( pt[i] & _PAGE_PRESENT ) 11.2547 - { 11.2548 - unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 11.2549 - struct pfn_info *l1page; 11.2550 - 11.2551 - if (l1pfn>max_page) 11.2552 - continue; 11.2553 - 11.2554 - l1page = &frame_table[l1pfn]; 11.2555 - 11.2556 - if ( (page_get_owner(l1page) != d) || 11.2557 - (l1pfn < 0x100) || (l1pfn > max_page) ) 11.2558 - continue; 11.2559 - 11.2560 - adjust(l1page, 1, 0); 11.2561 - } 11.2562 - } 11.2563 - 11.2564 - unmap_domain_mem(pt); 11.2565 - break; 11.2566 - } 11.2567 - 11.2568 - 11.2569 - page->tlbflush_timestamp = 0; 11.2570 - 11.2571 - list_ent = frame_table[pfn].list.next; 11.2572 - } 11.2573 - 11.2574 - spin_unlock(&d->page_alloc_lock); 11.2575 - 11.2576 - adjust(&frame_table[pagetable_val( 11.2577 - d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1); 11.2578 - 11.2579 - printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot ); 11.2580 - 11.2581 - if ( d != current->domain ) 11.2582 - domain_unpause(d); 11.2583 -} 11.2584 - 11.2585 -void audit_domains(void) 11.2586 -{ 11.2587 - struct domain *d; 11.2588 - for_each_domain ( d ) 11.2589 - audit_domain(d); 11.2590 -} 11.2591 - 11.2592 -void audit_domains_key(unsigned char key) 11.2593 -{ 11.2594 - audit_domains(); 11.2595 -} 11.2596 - 11.2597 -#endif
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 12.2 +++ b/xen/arch/x86/mm.c Tue Feb 08 15:13:51 2005 +0000 12.3 @@ -0,0 +1,2598 @@ 12.4 +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ 12.5 +/****************************************************************************** 12.6 + * arch/x86/mm.c 12.7 + * 12.8 + * Copyright (c) 2002-2005 K A Fraser 12.9 + * Copyright (c) 2004 Christian Limpach 12.10 + * 12.11 + * This program is free software; you can redistribute it and/or modify 12.12 + * it under the terms of the GNU General Public License as published by 12.13 + * the Free Software Foundation; either version 2 of the License, or 12.14 + * (at your option) any later version. 12.15 + * 12.16 + * This program is distributed in the hope that it will be useful, 12.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12.19 + * GNU General Public License for more details. 12.20 + * 12.21 + * You should have received a copy of the GNU General Public License 12.22 + * along with this program; if not, write to the Free Software 12.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 12.24 + */ 12.25 + 12.26 +/* 12.27 + * A description of the x86 page table API: 12.28 + * 12.29 + * Domains trap to do_mmu_update with a list of update requests. 12.30 + * This is a list of (ptr, val) pairs, where the requested operation 12.31 + * is *ptr = val. 12.32 + * 12.33 + * Reference counting of pages: 12.34 + * ---------------------------- 12.35 + * Each page has two refcounts: tot_count and type_count. 12.36 + * 12.37 + * TOT_COUNT is the obvious reference count. It counts all uses of a 12.38 + * physical page frame by a domain, including uses as a page directory, 12.39 + * a page table, or simple mappings via a PTE. This count prevents a 12.40 + * domain from releasing a frame back to the free pool when it still holds 12.41 + * a reference to it. 12.42 + * 12.43 + * TYPE_COUNT is more subtle. A frame can be put to one of three 12.44 + * mutually-exclusive uses: it might be used as a page directory, or a 12.45 + * page table, or it may be mapped writable by the domain [of course, a 12.46 + * frame may not be used in any of these three ways!]. 12.47 + * So, type_count is a count of the number of times a frame is being 12.48 + * referred to in its current incarnation. Therefore, a page can only 12.49 + * change its type when its type count is zero. 12.50 + * 12.51 + * Pinning the page type: 12.52 + * ---------------------- 12.53 + * The type of a page can be pinned/unpinned with the commands 12.54 + * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, 12.55 + * pinning is not reference counted, so it can't be nested). 12.56 + * This is useful to prevent a page's type count falling to zero, at which 12.57 + * point safety checks would need to be carried out next time the count 12.58 + * is increased again. 12.59 + * 12.60 + * A further note on writable page mappings: 12.61 + * ----------------------------------------- 12.62 + * For simplicity, the count of writable mappings for a page may not 12.63 + * correspond to reality. The 'writable count' is incremented for every 12.64 + * PTE which maps the page with the _PAGE_RW flag set. However, for 12.65 + * write access to be possible the page directory entry must also have 12.66 + * its _PAGE_RW bit set. 
We do not check this as it complicates the 12.67 + * reference counting considerably [consider the case of multiple 12.68 + * directory entries referencing a single page table, some with the RW 12.69 + * bit set, others not -- it starts getting a bit messy]. 12.70 + * In normal use, this simplification shouldn't be a problem. 12.71 + * However, the logic can be added if required. 12.72 + * 12.73 + * One more note on read-only page mappings: 12.74 + * ----------------------------------------- 12.75 + * We want domains to be able to map pages for read-only access. The 12.76 + * main reason is that page tables and directories should be readable 12.77 + * by a domain, but it would not be safe for them to be writable. 12.78 + * However, domains have free access to rings 1 & 2 of the Intel 12.79 + * privilege model. In terms of page protection, these are considered 12.80 + * to be part of 'supervisor mode'. The WP bit in CR0 controls whether 12.81 + * read-only restrictions are respected in supervisor mode -- if the 12.82 + * bit is clear then any mapped page is writable. 12.83 + * 12.84 + * We get round this by always setting the WP bit and disallowing 12.85 + * updates to it. This is very unlikely to cause a problem for guest 12.86 + * OS's, which will generally use the WP bit to simplify copy-on-write 12.87 + * implementation (in that case, OS wants a fault when it writes to 12.88 + * an application-supplied buffer). 12.89 + */ 12.90 + 12.91 +#include <xen/config.h> 12.92 +#include <xen/init.h> 12.93 +#include <xen/kernel.h> 12.94 +#include <xen/lib.h> 12.95 +#include <xen/mm.h> 12.96 +#include <xen/sched.h> 12.97 +#include <xen/errno.h> 12.98 +#include <xen/perfc.h> 12.99 +#include <xen/irq.h> 12.100 +#include <xen/softirq.h> 12.101 +#include <asm/shadow.h> 12.102 +#include <asm/page.h> 12.103 +#include <asm/flushtlb.h> 12.104 +#include <asm/io.h> 12.105 +#include <asm/uaccess.h> 12.106 +#include <asm/domain_page.h> 12.107 +#include <asm/ldt.h> 12.108 + 12.109 +#ifdef VERBOSE 12.110 +#define MEM_LOG(_f, _a...) \ 12.111 + printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ 12.112 + current->domain->id , __LINE__ , ## _a ) 12.113 +#else 12.114 +#define MEM_LOG(_f, _a...) ((void)0) 12.115 +#endif 12.116 + 12.117 +static int alloc_l2_table(struct pfn_info *page); 12.118 +static int alloc_l1_table(struct pfn_info *page); 12.119 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); 12.120 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 12.121 + u32 type, 12.122 + struct domain *d); 12.123 + 12.124 +static void free_l2_table(struct pfn_info *page); 12.125 +static void free_l1_table(struct pfn_info *page); 12.126 + 12.127 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); 12.128 +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); 12.129 + 12.130 +/* Used to defer flushing of memory structures. */ 12.131 +static struct { 12.132 +#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ 12.133 +#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ 12.134 + unsigned long deferred_ops; 12.135 + /* If non-NULL, specifies a foreign subject domain for some operations. */ 12.136 + struct domain *foreign; 12.137 +} __cacheline_aligned percpu_info[NR_CPUS]; 12.138 + 12.139 +/* 12.140 + * Returns the current foreign domain; defaults to the currently-executing 12.141 + * domain if a foreign override hasn't been specified. 12.142 + */ 12.143 +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? 
: current->domain) 12.144 + 12.145 +/* Private domain structs for DOMID_XEN and DOMID_IO. */ 12.146 +static struct domain *dom_xen, *dom_io; 12.147 + 12.148 +/* Frame table and its size in pages. */ 12.149 +struct pfn_info *frame_table; 12.150 +unsigned long frame_table_size; 12.151 +unsigned long max_page; 12.152 + 12.153 +void __init init_frametable(void) 12.154 +{ 12.155 + unsigned long i, p; 12.156 + 12.157 + frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; 12.158 + frame_table_size = max_page * sizeof(struct pfn_info); 12.159 + frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; 12.160 + 12.161 + for ( i = 0; i < frame_table_size; i += (4UL << 20) ) 12.162 + { 12.163 + p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); 12.164 + if ( p == 0 ) 12.165 + panic("Not enough memory for frame table\n"); 12.166 + map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 12.167 + 4UL << 20, PAGE_HYPERVISOR); 12.168 + } 12.169 + 12.170 + memset(frame_table, 0, frame_table_size); 12.171 +} 12.172 + 12.173 +void arch_init_memory(void) 12.174 +{ 12.175 + extern void subarch_init_memory(struct domain *); 12.176 + 12.177 + memset(percpu_info, 0, sizeof(percpu_info)); 12.178 + 12.179 + /* 12.180 + * Initialise our DOMID_XEN domain. 12.181 + * Any Xen-heap pages that we will allow to be mapped will have 12.182 + * their domain field set to dom_xen. 12.183 + */ 12.184 + dom_xen = alloc_domain_struct(); 12.185 + atomic_set(&dom_xen->refcnt, 1); 12.186 + dom_xen->id = DOMID_XEN; 12.187 + 12.188 + /* 12.189 + * Initialise our DOMID_IO domain. 12.190 + * This domain owns no pages but is considered a special case when 12.191 + * mapping I/O pages, as the mappings occur at the priv of the caller. 12.192 + */ 12.193 + dom_io = alloc_domain_struct(); 12.194 + atomic_set(&dom_io->refcnt, 1); 12.195 + dom_io->id = DOMID_IO; 12.196 + 12.197 + subarch_init_memory(dom_xen); 12.198 +} 12.199 + 12.200 +void write_ptbase(struct exec_domain *ed) 12.201 +{ 12.202 + struct domain *d = ed->domain; 12.203 + unsigned long pa; 12.204 + 12.205 +#ifdef CONFIG_VMX 12.206 + if ( unlikely(shadow_mode(d)) ) 12.207 + pa = ((shadow_mode(d) == SHM_full_32) ? 12.208 + pagetable_val(ed->arch.monitor_table) : 12.209 + pagetable_val(ed->arch.shadow_table)); 12.210 + else 12.211 + pa = pagetable_val(ed->arch.pagetable); 12.212 +#else 12.213 + if ( unlikely(shadow_mode(d)) ) 12.214 + pa = pagetable_val(ed->arch.shadow_table); 12.215 + else 12.216 + pa = pagetable_val(ed->arch.pagetable); 12.217 +#endif 12.218 + 12.219 + write_cr3(pa); 12.220 +} 12.221 + 12.222 +static void __invalidate_shadow_ldt(struct exec_domain *d) 12.223 +{ 12.224 + int i; 12.225 + unsigned long pfn; 12.226 + struct pfn_info *page; 12.227 + 12.228 + d->arch.shadow_ldt_mapcnt = 0; 12.229 + 12.230 + for ( i = 16; i < 32; i++ ) 12.231 + { 12.232 + pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]); 12.233 + if ( pfn == 0 ) continue; 12.234 + d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 12.235 + page = &frame_table[pfn]; 12.236 + ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); 12.237 + ASSERT_PAGE_IS_DOMAIN(page, d->domain); 12.238 + put_page_and_type(page); 12.239 + } 12.240 + 12.241 + /* Dispose of the (now possibly invalid) mappings from the TLB. 
*/ 12.242 + percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; 12.243 +} 12.244 + 12.245 + 12.246 +static inline void invalidate_shadow_ldt(struct exec_domain *d) 12.247 +{ 12.248 + if ( d->arch.shadow_ldt_mapcnt != 0 ) 12.249 + __invalidate_shadow_ldt(d); 12.250 +} 12.251 + 12.252 + 12.253 +static int alloc_segdesc_page(struct pfn_info *page) 12.254 +{ 12.255 + struct desc_struct *descs; 12.256 + int i; 12.257 + 12.258 + descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); 12.259 + 12.260 + for ( i = 0; i < 512; i++ ) 12.261 + if ( unlikely(!check_descriptor(&descs[i])) ) 12.262 + goto fail; 12.263 + 12.264 + unmap_domain_mem(descs); 12.265 + return 1; 12.266 + 12.267 + fail: 12.268 + unmap_domain_mem(descs); 12.269 + return 0; 12.270 +} 12.271 + 12.272 + 12.273 +/* Map shadow page at offset @off. */ 12.274 +int map_ldt_shadow_page(unsigned int off) 12.275 +{ 12.276 + struct exec_domain *ed = current; 12.277 + struct domain *d = ed->domain; 12.278 + unsigned long l1e; 12.279 + 12.280 + if ( unlikely(in_irq()) ) 12.281 + BUG(); 12.282 + 12.283 + __get_user(l1e, (unsigned long *) 12.284 + &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]); 12.285 + 12.286 + if ( unlikely(!(l1e & _PAGE_PRESENT)) || 12.287 + unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 12.288 + d, PGT_ldt_page)) ) 12.289 + return 0; 12.290 + 12.291 + ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); 12.292 + ed->arch.shadow_ldt_mapcnt++; 12.293 + 12.294 + return 1; 12.295 +} 12.296 + 12.297 + 12.298 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) 12.299 +{ 12.300 + struct pfn_info *page = &frame_table[page_nr]; 12.301 + 12.302 + if ( unlikely(!pfn_is_ram(page_nr)) ) 12.303 + { 12.304 + MEM_LOG("Pfn %08lx is not RAM", page_nr); 12.305 + return 0; 12.306 + } 12.307 + 12.308 + if ( unlikely(!get_page(page, d)) ) 12.309 + { 12.310 + MEM_LOG("Could not get page ref for pfn %08lx", page_nr); 12.311 + return 0; 12.312 + } 12.313 + 12.314 + return 1; 12.315 +} 12.316 + 12.317 + 12.318 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 12.319 + u32 type, 12.320 + struct domain *d) 12.321 +{ 12.322 + struct pfn_info *page = &frame_table[page_nr]; 12.323 + 12.324 + if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) 12.325 + return 0; 12.326 + 12.327 + if ( unlikely(!get_page_type(page, type)) ) 12.328 + { 12.329 +#ifdef VERBOSE 12.330 + if ( (type & PGT_type_mask) != PGT_l1_page_table ) 12.331 + MEM_LOG("Bad page type for pfn %08lx (%08x)", 12.332 + page_nr, page->u.inuse.type_info); 12.333 +#endif 12.334 + put_page(page); 12.335 + return 0; 12.336 + } 12.337 + 12.338 + return 1; 12.339 +} 12.340 + 12.341 + 12.342 +/* 12.343 + * We allow an L2 tables to map each other (a.k.a. linear page tables). It 12.344 + * needs some special care with reference counst and access permissions: 12.345 + * 1. The mapping entry must be read-only, or the guest may get write access 12.346 + * to its own PTEs. 12.347 + * 2. We must only bump the reference counts for an *already validated* 12.348 + * L2 table, or we can end up in a deadlock in get_page_type() by waiting 12.349 + * on a validation that is required to complete that validation. 12.350 + * 3. We only need to increment the reference counts for the mapped page 12.351 + * frame if it is mapped by a different L2 table. This is sufficient and 12.352 + * also necessary to allow validation of an L2 table mapping itself. 
12.353 + */ 12.354 +static int 12.355 +get_linear_pagetable( 12.356 + l2_pgentry_t l2e, unsigned long pfn, struct domain *d) 12.357 +{ 12.358 + u32 x, y; 12.359 + struct pfn_info *page; 12.360 + 12.361 + if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) 12.362 + { 12.363 + MEM_LOG("Attempt to create linear p.t. with write perms"); 12.364 + return 0; 12.365 + } 12.366 + 12.367 + if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 12.368 + { 12.369 + /* Make sure the mapped frame belongs to the correct domain. */ 12.370 + if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) ) 12.371 + return 0; 12.372 + 12.373 + /* 12.374 + * Make sure that the mapped frame is an already-validated L2 table. 12.375 + * If so, atomically increment the count (checking for overflow). 12.376 + */ 12.377 + page = &frame_table[l2_pgentry_to_pagenr(l2e)]; 12.378 + y = page->u.inuse.type_info; 12.379 + do { 12.380 + x = y; 12.381 + if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || 12.382 + unlikely((x & (PGT_type_mask|PGT_validated)) != 12.383 + (PGT_l2_page_table|PGT_validated)) ) 12.384 + { 12.385 + put_page(page); 12.386 + return 0; 12.387 + } 12.388 + } 12.389 + while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); 12.390 + } 12.391 + 12.392 + return 1; 12.393 +} 12.394 + 12.395 + 12.396 +static int 12.397 +get_page_from_l1e( 12.398 + l1_pgentry_t l1e, struct domain *d) 12.399 +{ 12.400 + unsigned long l1v = l1_pgentry_val(l1e); 12.401 + unsigned long pfn = l1_pgentry_to_pagenr(l1e); 12.402 + struct pfn_info *page = &frame_table[pfn]; 12.403 + extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); 12.404 + 12.405 + if ( !(l1v & _PAGE_PRESENT) ) 12.406 + return 1; 12.407 + 12.408 + if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) 12.409 + { 12.410 + MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); 12.411 + return 0; 12.412 + } 12.413 + 12.414 + if ( unlikely(!pfn_is_ram(pfn)) ) 12.415 + { 12.416 + /* Revert to caller privileges if FD == DOMID_IO. */ 12.417 + if ( d == dom_io ) 12.418 + d = current->domain; 12.419 + 12.420 + if ( IS_PRIV(d) ) 12.421 + return 1; 12.422 + 12.423 + if ( IS_CAPABLE_PHYSDEV(d) ) 12.424 + return domain_iomem_in_pfn(d, pfn); 12.425 + 12.426 + MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); 12.427 + return 0; 12.428 + } 12.429 + 12.430 + return ((l1v & _PAGE_RW) ? 12.431 + get_page_and_type(page, d, PGT_writable_page) : 12.432 + get_page(page, d)); 12.433 +} 12.434 + 12.435 + 12.436 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ 12.437 +static int 12.438 +get_page_from_l2e( 12.439 + l2_pgentry_t l2e, unsigned long pfn, 12.440 + struct domain *d, unsigned long va_idx) 12.441 +{ 12.442 + int rc; 12.443 + 12.444 + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 12.445 + return 1; 12.446 + 12.447 + if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 12.448 + { 12.449 + MEM_LOG("Bad L2 page type settings %04lx", 12.450 + l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); 12.451 + return 0; 12.452 + } 12.453 + 12.454 + rc = get_page_and_type_from_pagenr( 12.455 + l2_pgentry_to_pagenr(l2e), 12.456 + PGT_l1_page_table | (va_idx<<PGT_va_shift), d); 12.457 + 12.458 + if ( unlikely(!rc) ) 12.459 + return get_linear_pagetable(l2e, pfn, d); 12.460 + 12.461 + return 1; 12.462 +} 12.463 + 12.464 + 12.465 +static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) 12.466 +{ 12.467 + unsigned long l1v = l1_pgentry_val(l1e); 12.468 + unsigned long pfn = l1_pgentry_to_pagenr(l1e); 12.469 + struct pfn_info *page = &frame_table[pfn]; 12.470 + struct domain *e; 12.471 + 12.472 + if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) ) 12.473 + return; 12.474 + 12.475 + e = page_get_owner(page); 12.476 + if ( unlikely(e != d) ) 12.477 + { 12.478 + /* 12.479 + * Unmap a foreign page that may have been mapped via a grant table. 12.480 + * Note that this can fail for a privileged domain that can map foreign 12.481 + * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings 12.482 + * counted via a grant entry and some counted directly in the page 12.483 + * structure's reference count. Note that reference counts won't get 12.484 + * dangerously confused as long as we always try to decrement the 12.485 + * grant entry first. We may end up with a mismatch between which 12.486 + * mappings and which unmappings are counted via the grant entry, but 12.487 + * really it doesn't matter as privileged domains have carte blanche. 12.488 + */ 12.489 + if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) ) 12.490 + return; 12.491 + /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */ 12.492 + } 12.493 + 12.494 + if ( l1v & _PAGE_RW ) 12.495 + { 12.496 + put_page_and_type(page); 12.497 + } 12.498 + else 12.499 + { 12.500 + /* We expect this is rare so we blow the entire shadow LDT. */ 12.501 + if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 12.502 + PGT_ldt_page)) && 12.503 + unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) 12.504 + invalidate_shadow_ldt(e->exec_domain[0]); 12.505 + put_page(page); 12.506 + } 12.507 +} 12.508 + 12.509 + 12.510 +/* 12.511 + * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 12.512 + * Note also that this automatically deals correctly with linear p.t.'s. 
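The two routines above must remain exactly symmetric: whatever references get_page_from_l1e() takes for a present entry, put_page_from_l1e() later drops. A minimal sketch of that pairing (mfn and d are hypothetical placeholders):

l1_pgentry_t e = mk_l1_pgentry((mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW);

if ( get_page_from_l1e(e, d) )    /* RW: general ref + PGT_writable_page type ref */
{
    /* ... the entry lives in some validated L1 table ... */
    put_page_from_l1e(e, d);      /* drops exactly the references taken above */
}
/* A read-only entry (no _PAGE_RW) takes and drops only the general reference. */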
12.513 + */ 12.514 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 12.515 +{ 12.516 + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 12.517 + ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) 12.518 + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); 12.519 +} 12.520 + 12.521 + 12.522 +static int alloc_l2_table(struct pfn_info *page) 12.523 +{ 12.524 + struct domain *d = page_get_owner(page); 12.525 + unsigned long page_nr = page_to_pfn(page); 12.526 + l2_pgentry_t *pl2e; 12.527 + int i; 12.528 + 12.529 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 12.530 + 12.531 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 12.532 + if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) 12.533 + goto fail; 12.534 + 12.535 +#if defined(__i386__) 12.536 + /* Now we add our private high mappings. */ 12.537 + memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 12.538 + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 12.539 + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 12.540 + pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = 12.541 + mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 12.542 + pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 12.543 + mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 12.544 + __PAGE_HYPERVISOR); 12.545 +#endif 12.546 + 12.547 + unmap_domain_mem(pl2e); 12.548 + return 1; 12.549 + 12.550 + fail: 12.551 + while ( i-- > 0 ) 12.552 + put_page_from_l2e(pl2e[i], page_nr); 12.553 + 12.554 + unmap_domain_mem(pl2e); 12.555 + return 0; 12.556 +} 12.557 + 12.558 + 12.559 +static int alloc_l1_table(struct pfn_info *page) 12.560 +{ 12.561 + struct domain *d = page_get_owner(page); 12.562 + unsigned long page_nr = page_to_pfn(page); 12.563 + l1_pgentry_t *pl1e; 12.564 + int i; 12.565 + 12.566 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 12.567 + 12.568 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.569 + if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) 12.570 + goto fail; 12.571 + 12.572 + unmap_domain_mem(pl1e); 12.573 + return 1; 12.574 + 12.575 + fail: 12.576 + while ( i-- > 0 ) 12.577 + put_page_from_l1e(pl1e[i], d); 12.578 + 12.579 + unmap_domain_mem(pl1e); 12.580 + return 0; 12.581 +} 12.582 + 12.583 + 12.584 +static void free_l2_table(struct pfn_info *page) 12.585 +{ 12.586 + unsigned long page_nr = page - frame_table; 12.587 + l2_pgentry_t *pl2e; 12.588 + int i; 12.589 + 12.590 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 12.591 + 12.592 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 12.593 + put_page_from_l2e(pl2e[i], page_nr); 12.594 + 12.595 + unmap_domain_mem(pl2e); 12.596 +} 12.597 + 12.598 + 12.599 +static void free_l1_table(struct pfn_info *page) 12.600 +{ 12.601 + struct domain *d = page_get_owner(page); 12.602 + unsigned long page_nr = page - frame_table; 12.603 + l1_pgentry_t *pl1e; 12.604 + int i; 12.605 + 12.606 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 12.607 + 12.608 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.609 + put_page_from_l1e(pl1e[i], d); 12.610 + 12.611 + unmap_domain_mem(pl1e); 12.612 +} 12.613 + 12.614 + 12.615 +static inline int update_l2e(l2_pgentry_t *pl2e, 12.616 + l2_pgentry_t ol2e, 12.617 + l2_pgentry_t nl2e) 12.618 +{ 12.619 + unsigned long o = cmpxchg((unsigned long *)pl2e, 12.620 + l2_pgentry_val(ol2e), 12.621 + l2_pgentry_val(nl2e)); 12.622 + if ( o != l2_pgentry_val(ol2e) ) 12.623 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 12.624 + l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); 12.625 + return (o == 
l2_pgentry_val(ol2e)); 12.626 +} 12.627 + 12.628 + 12.629 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ 12.630 +static int mod_l2_entry(l2_pgentry_t *pl2e, 12.631 + l2_pgentry_t nl2e, 12.632 + unsigned long pfn) 12.633 +{ 12.634 + l2_pgentry_t ol2e; 12.635 + unsigned long _ol2e; 12.636 + 12.637 + if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= 12.638 + DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 12.639 + { 12.640 + MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); 12.641 + return 0; 12.642 + } 12.643 + 12.644 + if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) 12.645 + return 0; 12.646 + ol2e = mk_l2_pgentry(_ol2e); 12.647 + 12.648 + if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) 12.649 + { 12.650 + /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 12.651 + if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) 12.652 + return update_l2e(pl2e, ol2e, nl2e); 12.653 + 12.654 + if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, 12.655 + ((unsigned long)pl2e & 12.656 + ~PAGE_MASK) >> 2)) ) 12.657 + return 0; 12.658 + 12.659 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 12.660 + { 12.661 + put_page_from_l2e(nl2e, pfn); 12.662 + return 0; 12.663 + } 12.664 + 12.665 + put_page_from_l2e(ol2e, pfn); 12.666 + return 1; 12.667 + } 12.668 + 12.669 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 12.670 + return 0; 12.671 + 12.672 + put_page_from_l2e(ol2e, pfn); 12.673 + return 1; 12.674 +} 12.675 + 12.676 + 12.677 +static inline int update_l1e(l1_pgentry_t *pl1e, 12.678 + l1_pgentry_t ol1e, 12.679 + l1_pgentry_t nl1e) 12.680 +{ 12.681 + unsigned long o = l1_pgentry_val(ol1e); 12.682 + unsigned long n = l1_pgentry_val(nl1e); 12.683 + 12.684 + if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || 12.685 + unlikely(o != l1_pgentry_val(ol1e)) ) 12.686 + { 12.687 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 12.688 + l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); 12.689 + return 0; 12.690 + } 12.691 + 12.692 + return 1; 12.693 +} 12.694 + 12.695 + 12.696 +/* Update the L1 entry at pl1e to new value nl1e. */ 12.697 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) 12.698 +{ 12.699 + l1_pgentry_t ol1e; 12.700 + unsigned long _ol1e; 12.701 + struct domain *d = current->domain; 12.702 + 12.703 + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) 12.704 + { 12.705 + MEM_LOG("Bad get_user\n"); 12.706 + return 0; 12.707 + } 12.708 + 12.709 + ol1e = mk_l1_pgentry(_ol1e); 12.710 + 12.711 + if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) 12.712 + { 12.713 + /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? 
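A worked example of the test that follows (values hypothetical): ol1e = 0x00045067 and nl1e = 0x00045027 differ only in the dirty bit, so ((0x00045067 ^ 0x00045027) & ~0xffc) evaluates to 0 and the fast path applies with no reference-count work; any change to the frame number, the RW bit or the present bit falls outside the 0xffc flag mask and forces the full get/put sequence further down.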
*/ 12.714 + if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) 12.715 + return update_l1e(pl1e, ol1e, nl1e); 12.716 + 12.717 + if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) 12.718 + return 0; 12.719 + 12.720 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 12.721 + { 12.722 + put_page_from_l1e(nl1e, d); 12.723 + return 0; 12.724 + } 12.725 + 12.726 + put_page_from_l1e(ol1e, d); 12.727 + return 1; 12.728 + } 12.729 + 12.730 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 12.731 + return 0; 12.732 + 12.733 + put_page_from_l1e(ol1e, d); 12.734 + return 1; 12.735 +} 12.736 + 12.737 + 12.738 +int alloc_page_type(struct pfn_info *page, unsigned int type) 12.739 +{ 12.740 + switch ( type ) 12.741 + { 12.742 + case PGT_l1_page_table: 12.743 + return alloc_l1_table(page); 12.744 + case PGT_l2_page_table: 12.745 + return alloc_l2_table(page); 12.746 + case PGT_gdt_page: 12.747 + case PGT_ldt_page: 12.748 + return alloc_segdesc_page(page); 12.749 + default: 12.750 + printk("Bad type in alloc_page_type %x t=%x c=%x\n", 12.751 + type, page->u.inuse.type_info, 12.752 + page->count_info); 12.753 + BUG(); 12.754 + } 12.755 + 12.756 + return 0; 12.757 +} 12.758 + 12.759 + 12.760 +void free_page_type(struct pfn_info *page, unsigned int type) 12.761 +{ 12.762 + struct domain *d = page_get_owner(page); 12.763 + 12.764 + switch ( type ) 12.765 + { 12.766 + case PGT_l1_page_table: 12.767 + free_l1_table(page); 12.768 + break; 12.769 + 12.770 + case PGT_l2_page_table: 12.771 + free_l2_table(page); 12.772 + break; 12.773 + 12.774 + default: 12.775 + BUG(); 12.776 + } 12.777 + 12.778 + if ( unlikely(shadow_mode(d)) && 12.779 + (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) 12.780 + { 12.781 + unshadow_table(page_to_pfn(page), type); 12.782 + put_shadow_status(d); 12.783 + } 12.784 +} 12.785 + 12.786 + 12.787 +void put_page_type(struct pfn_info *page) 12.788 +{ 12.789 + u32 nx, x, y = page->u.inuse.type_info; 12.790 + 12.791 + again: 12.792 + do { 12.793 + x = y; 12.794 + nx = x - 1; 12.795 + 12.796 + ASSERT((x & PGT_count_mask) != 0); 12.797 + 12.798 + /* 12.799 + * The page should always be validated while a reference is held. The 12.800 + * exception is during domain destruction, when we forcibly invalidate 12.801 + * page-table pages if we detect a referential loop. 12.802 + * See domain.c:relinquish_list(). 12.803 + */ 12.804 + ASSERT((x & PGT_validated) || 12.805 + test_bit(DF_DYING, &page_get_owner(page)->d_flags)); 12.806 + 12.807 + if ( unlikely((nx & PGT_count_mask) == 0) ) 12.808 + { 12.809 + /* Record TLB information for flush later. Races are harmless. */ 12.810 + page->tlbflush_timestamp = tlbflush_current_time(); 12.811 + 12.812 + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 12.813 + likely(nx & PGT_validated) ) 12.814 + { 12.815 + /* 12.816 + * Page-table pages must be unvalidated when count is zero. The 12.817 + * 'free' is safe because the refcnt is non-zero and validated 12.818 + * bit is clear => other ops will spin or fail. 12.819 + */ 12.820 + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 12.821 + x & ~PGT_validated)) != x) ) 12.822 + goto again; 12.823 + /* We cleared the 'valid bit' so we do the clear up. */ 12.824 + free_page_type(page, x & PGT_type_mask); 12.825 + /* Carry on, but with the 'valid bit' now clear. 
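Viewed from the callers, the usual lifetime of a page-table type reference is roughly the following sketch (simplified; error paths and pinning omitted):

if ( get_page_type(page, PGT_l1_page_table | PGT_va_mutable) )  /* count 0 -> 1 */
{
    /* The first reference runs alloc_page_type() -> alloc_l1_table() and then
     * sets PGT_validated; subsequent get/put calls merely move the count. */
    /* ... use the frame as an L1 table ... */
    put_page_type(page);  /* count 1 -> 0: PGT_validated is cleared and the
                           * table is torn down via free_page_type() */
}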
*/ 12.826 + x &= ~PGT_validated; 12.827 + nx &= ~PGT_validated; 12.828 + } 12.829 + } 12.830 + else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 12.831 + (PGT_pinned | 1)) ) 12.832 + { 12.833 + /* Page is now only pinned. Make the back pointer mutable again. */ 12.834 + nx |= PGT_va_mutable; 12.835 + } 12.836 + } 12.837 + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 12.838 +} 12.839 + 12.840 + 12.841 +int get_page_type(struct pfn_info *page, u32 type) 12.842 +{ 12.843 + u32 nx, x, y = page->u.inuse.type_info; 12.844 + 12.845 + again: 12.846 + do { 12.847 + x = y; 12.848 + nx = x + 1; 12.849 + if ( unlikely((nx & PGT_count_mask) == 0) ) 12.850 + { 12.851 + MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); 12.852 + return 0; 12.853 + } 12.854 + else if ( unlikely((x & PGT_count_mask) == 0) ) 12.855 + { 12.856 + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) 12.857 + { 12.858 + /* 12.859 + * On type change we check to flush stale TLB entries. This 12.860 + * may be unnecessary (e.g., page was GDT/LDT) but those 12.861 + * circumstances should be very rare. 12.862 + */ 12.863 + struct domain *d = page_get_owner(page); 12.864 + if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], 12.865 + page->tlbflush_timestamp)) ) 12.866 + { 12.867 + perfc_incr(need_flush_tlb_flush); 12.868 + flush_tlb_cpu(d->exec_domain[0]->processor); 12.869 + } 12.870 + 12.871 + /* We lose existing type, back pointer, and validity. */ 12.872 + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); 12.873 + nx |= type; 12.874 + 12.875 + /* No special validation needed for writable pages. */ 12.876 + /* Page tables and GDT/LDT need to be scanned for validity. */ 12.877 + if ( type == PGT_writable_page ) 12.878 + nx |= PGT_validated; 12.879 + } 12.880 + } 12.881 + else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) 12.882 + { 12.883 + if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) 12.884 + { 12.885 + if ( ((x & PGT_type_mask) != PGT_l2_page_table) || 12.886 + ((type & PGT_type_mask) != PGT_l1_page_table) ) 12.887 + MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", 12.888 + x & PGT_type_mask, type, page_to_pfn(page)); 12.889 + return 0; 12.890 + } 12.891 + else if ( (x & PGT_va_mask) == PGT_va_mutable ) 12.892 + { 12.893 + /* The va backpointer is mutable, hence we update it. */ 12.894 + nx &= ~PGT_va_mask; 12.895 + nx |= type; /* we know the actual type is correct */ 12.896 + } 12.897 + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) 12.898 + { 12.899 + /* This table is potentially mapped at multiple locations. */ 12.900 + nx &= ~PGT_va_mask; 12.901 + nx |= PGT_va_unknown; 12.902 + } 12.903 + } 12.904 + else if ( unlikely(!(x & PGT_validated)) ) 12.905 + { 12.906 + /* Someone else is updating validation of this page. Wait... */ 12.907 + while ( (y = page->u.inuse.type_info) == x ) 12.908 + { 12.909 + rep_nop(); 12.910 + barrier(); 12.911 + } 12.912 + goto again; 12.913 + } 12.914 + } 12.915 + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 12.916 + 12.917 + if ( unlikely(!(nx & PGT_validated)) ) 12.918 + { 12.919 + /* Try to validate page type; drop the new reference on failure. */ 12.920 + if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) 12.921 + { 12.922 + MEM_LOG("Error while validating pfn %08lx for type %08x." 
12.923 + " caf=%08x taf=%08x\n", 12.924 + page_to_pfn(page), type, 12.925 + page->count_info, 12.926 + page->u.inuse.type_info); 12.927 + /* Noone else can get a reference. We hold the only ref. */ 12.928 + page->u.inuse.type_info = 0; 12.929 + return 0; 12.930 + } 12.931 + 12.932 + /* Noone else is updating simultaneously. */ 12.933 + __set_bit(_PGT_validated, &page->u.inuse.type_info); 12.934 + } 12.935 + 12.936 + return 1; 12.937 +} 12.938 + 12.939 + 12.940 +int new_guest_cr3(unsigned long pfn) 12.941 +{ 12.942 + struct exec_domain *ed = current; 12.943 + struct domain *d = ed->domain; 12.944 + int okay, cpu = smp_processor_id(); 12.945 + unsigned long old_base_pfn; 12.946 + 12.947 + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); 12.948 + if ( likely(okay) ) 12.949 + { 12.950 + invalidate_shadow_ldt(ed); 12.951 + 12.952 + percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; 12.953 + old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; 12.954 + ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 12.955 + 12.956 + shadow_mk_pagetable(ed); 12.957 + 12.958 + write_ptbase(ed); 12.959 + 12.960 + put_page_and_type(&frame_table[old_base_pfn]); 12.961 + } 12.962 + else 12.963 + { 12.964 + MEM_LOG("Error while installing new baseptr %08lx", pfn); 12.965 + } 12.966 + 12.967 + return okay; 12.968 +} 12.969 + 12.970 +static int do_extended_command(unsigned long ptr, unsigned long val) 12.971 +{ 12.972 + int okay = 1, cpu = smp_processor_id(); 12.973 + unsigned int cmd = val & MMUEXT_CMD_MASK; 12.974 + unsigned long pfn = ptr >> PAGE_SHIFT; 12.975 + struct pfn_info *page = &frame_table[pfn]; 12.976 + struct exec_domain *ed = current; 12.977 + struct domain *d = ed->domain, *nd, *e; 12.978 + u32 x, y; 12.979 + domid_t domid; 12.980 + grant_ref_t gntref; 12.981 + 12.982 + switch ( cmd ) 12.983 + { 12.984 + case MMUEXT_PIN_L1_TABLE: 12.985 + case MMUEXT_PIN_L2_TABLE: 12.986 + /* 12.987 + * We insist that, if you pin an L1 page, it's the first thing that 12.988 + * you do to it. This is because we require the backptr to still be 12.989 + * mutable. This assumption seems safe. 12.990 + */ 12.991 + okay = get_page_and_type_from_pagenr( 12.992 + pfn, 12.993 + ((cmd==MMUEXT_PIN_L2_TABLE) ? 
12.994 + PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), 12.995 + FOREIGNDOM); 12.996 + 12.997 + if ( unlikely(!okay) ) 12.998 + { 12.999 + MEM_LOG("Error while pinning pfn %08lx", pfn); 12.1000 + break; 12.1001 + } 12.1002 + 12.1003 + if ( unlikely(test_and_set_bit(_PGT_pinned, 12.1004 + &page->u.inuse.type_info)) ) 12.1005 + { 12.1006 + MEM_LOG("Pfn %08lx already pinned", pfn); 12.1007 + put_page_and_type(page); 12.1008 + okay = 0; 12.1009 + break; 12.1010 + } 12.1011 + 12.1012 + break; 12.1013 + 12.1014 + case MMUEXT_UNPIN_TABLE: 12.1015 + if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) 12.1016 + { 12.1017 + MEM_LOG("Page %08lx bad domain (dom=%p)", 12.1018 + ptr, page_get_owner(page)); 12.1019 + } 12.1020 + else if ( likely(test_and_clear_bit(_PGT_pinned, 12.1021 + &page->u.inuse.type_info)) ) 12.1022 + { 12.1023 + put_page_and_type(page); 12.1024 + put_page(page); 12.1025 + } 12.1026 + else 12.1027 + { 12.1028 + okay = 0; 12.1029 + put_page(page); 12.1030 + MEM_LOG("Pfn %08lx not pinned", pfn); 12.1031 + } 12.1032 + break; 12.1033 + 12.1034 + case MMUEXT_NEW_BASEPTR: 12.1035 + okay = new_guest_cr3(pfn); 12.1036 + break; 12.1037 + 12.1038 + case MMUEXT_TLB_FLUSH: 12.1039 + percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; 12.1040 + break; 12.1041 + 12.1042 + case MMUEXT_INVLPG: 12.1043 + __flush_tlb_one(ptr); 12.1044 + break; 12.1045 + 12.1046 + case MMUEXT_FLUSH_CACHE: 12.1047 + if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) 12.1048 + { 12.1049 + MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); 12.1050 + okay = 0; 12.1051 + } 12.1052 + else 12.1053 + { 12.1054 + wbinvd(); 12.1055 + } 12.1056 + break; 12.1057 + 12.1058 + case MMUEXT_SET_LDT: 12.1059 + { 12.1060 + unsigned long ents = val >> MMUEXT_CMD_SHIFT; 12.1061 + if ( ((ptr & (PAGE_SIZE-1)) != 0) || 12.1062 + (ents > 8192) || 12.1063 + ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || 12.1064 + ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) 12.1065 + { 12.1066 + okay = 0; 12.1067 + MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); 12.1068 + } 12.1069 + else if ( (ed->arch.ldt_ents != ents) || 12.1070 + (ed->arch.ldt_base != ptr) ) 12.1071 + { 12.1072 + invalidate_shadow_ldt(ed); 12.1073 + ed->arch.ldt_base = ptr; 12.1074 + ed->arch.ldt_ents = ents; 12.1075 + load_LDT(ed); 12.1076 + percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; 12.1077 + if ( ents != 0 ) 12.1078 + percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; 12.1079 + } 12.1080 + break; 12.1081 + } 12.1082 + 12.1083 + case MMUEXT_SET_FOREIGNDOM: 12.1084 + domid = (domid_t)(val >> 16); 12.1085 + 12.1086 + if ( (e = percpu_info[cpu].foreign) != NULL ) 12.1087 + put_domain(e); 12.1088 + percpu_info[cpu].foreign = NULL; 12.1089 + 12.1090 + if ( !IS_PRIV(d) ) 12.1091 + { 12.1092 + switch ( domid ) 12.1093 + { 12.1094 + case DOMID_IO: 12.1095 + get_knownalive_domain(dom_io); 12.1096 + percpu_info[cpu].foreign = dom_io; 12.1097 + break; 12.1098 + default: 12.1099 + MEM_LOG("Dom %u cannot set foreign dom\n", d->id); 12.1100 + okay = 0; 12.1101 + break; 12.1102 + } 12.1103 + } 12.1104 + else 12.1105 + { 12.1106 + percpu_info[cpu].foreign = e = find_domain_by_id(domid); 12.1107 + if ( e == NULL ) 12.1108 + { 12.1109 + switch ( domid ) 12.1110 + { 12.1111 + case DOMID_XEN: 12.1112 + get_knownalive_domain(dom_xen); 12.1113 + percpu_info[cpu].foreign = dom_xen; 12.1114 + break; 12.1115 + case DOMID_IO: 12.1116 + get_knownalive_domain(dom_io); 12.1117 + percpu_info[cpu].foreign = dom_io; 12.1118 + break; 12.1119 + default: 12.1120 + MEM_LOG("Unknown domain '%u'", 
domid); 12.1121 + okay = 0; 12.1122 + break; 12.1123 + } 12.1124 + } 12.1125 + } 12.1126 + break; 12.1127 + 12.1128 + case MMUEXT_TRANSFER_PAGE: 12.1129 + domid = (domid_t)(val >> 16); 12.1130 + gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); 12.1131 + 12.1132 + if ( unlikely(IS_XEN_HEAP_FRAME(page)) || 12.1133 + unlikely(!pfn_is_ram(pfn)) || 12.1134 + unlikely((e = find_domain_by_id(domid)) == NULL) ) 12.1135 + { 12.1136 + MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); 12.1137 + okay = 0; 12.1138 + break; 12.1139 + } 12.1140 + 12.1141 + spin_lock(&d->page_alloc_lock); 12.1142 + 12.1143 + /* 12.1144 + * The tricky bit: atomically release ownership while there is just one 12.1145 + * benign reference to the page (PGC_allocated). If that reference 12.1146 + * disappears then the deallocation routine will safely spin. 12.1147 + */ 12.1148 + nd = page_get_owner(page); 12.1149 + y = page->count_info; 12.1150 + do { 12.1151 + x = y; 12.1152 + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 12.1153 + (1|PGC_allocated)) || 12.1154 + unlikely(nd != d) ) 12.1155 + { 12.1156 + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 12.1157 + " caf=%08x, taf=%08x\n", page_to_pfn(page), 12.1158 + d, d->id, nd, x, page->u.inuse.type_info); 12.1159 + spin_unlock(&d->page_alloc_lock); 12.1160 + put_domain(e); 12.1161 + return 0; 12.1162 + } 12.1163 + __asm__ __volatile__( 12.1164 + LOCK_PREFIX "cmpxchg8b %2" 12.1165 + : "=d" (nd), "=a" (y), 12.1166 + "=m" (*(volatile u64 *)(&page->count_info)) 12.1167 + : "0" (d), "1" (x), "c" (NULL), "b" (x) ); 12.1168 + } 12.1169 + while ( unlikely(nd != d) || unlikely(y != x) ); 12.1170 + 12.1171 + /* 12.1172 + * Unlink from 'd'. At least one reference remains (now anonymous), so 12.1173 + * noone else is spinning to try to delete this page from 'd'. 12.1174 + */ 12.1175 + d->tot_pages--; 12.1176 + list_del(&page->list); 12.1177 + 12.1178 + spin_unlock(&d->page_alloc_lock); 12.1179 + 12.1180 + spin_lock(&e->page_alloc_lock); 12.1181 + 12.1182 + /* 12.1183 + * Check that 'e' will accept the page and has reservation headroom. 12.1184 + * Also, a domain mustn't have PGC_allocated pages when it is dying. 12.1185 + */ 12.1186 + ASSERT(e->tot_pages <= e->max_pages); 12.1187 + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 12.1188 + unlikely(e->tot_pages == e->max_pages) || 12.1189 + unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) 12.1190 + { 12.1191 + MEM_LOG("Transferee has no reservation headroom (%d,%d), or " 12.1192 + "provided a bad grant ref, or is dying (%08lx).\n", 12.1193 + e->tot_pages, e->max_pages, e->d_flags); 12.1194 + spin_unlock(&e->page_alloc_lock); 12.1195 + put_domain(e); 12.1196 + okay = 0; 12.1197 + break; 12.1198 + } 12.1199 + 12.1200 + /* Okay, add the page to 'e'. */ 12.1201 + if ( unlikely(e->tot_pages++ == 0) ) 12.1202 + get_knownalive_domain(e); 12.1203 + list_add_tail(&page->list, &e->page_list); 12.1204 + page_set_owner(page, e); 12.1205 + 12.1206 + spin_unlock(&e->page_alloc_lock); 12.1207 + 12.1208 + /* Transfer is all done: tell the guest about its new page frame. 
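For readers unfamiliar with the cmpxchg8b idiom used in this case (and again under MMUEXT_REASSIGN_PAGE below), it behaves roughly like the non-atomic sketch below, except that the check and the owner update happen as a single 64-bit compare-and-exchange spanning count_info and the adjacent owner field:

/* Illustration only: non-atomic equivalent of the cmpxchg8b above. */
if ( ((page->count_info & (PGC_count_mask | PGC_allocated)) == (1 | PGC_allocated)) &&
     (page_get_owner(page) == d) )
{
    page_set_owner(page, NULL);   /* drop ownership; count_info is left as-is */
    /* success: fall through to unlink the page from d's list */
}
else
{
    /* re-read the fields and retry, exactly as the do/while loop does */
}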
*/ 12.1209 + gnttab_notify_transfer(e, gntref, pfn); 12.1210 + 12.1211 + put_domain(e); 12.1212 + break; 12.1213 + 12.1214 + case MMUEXT_REASSIGN_PAGE: 12.1215 + if ( unlikely(!IS_PRIV(d)) ) 12.1216 + { 12.1217 + MEM_LOG("Dom %u has no reassignment priv", d->id); 12.1218 + okay = 0; 12.1219 + break; 12.1220 + } 12.1221 + 12.1222 + e = percpu_info[cpu].foreign; 12.1223 + if ( unlikely(e == NULL) ) 12.1224 + { 12.1225 + MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); 12.1226 + okay = 0; 12.1227 + break; 12.1228 + } 12.1229 + 12.1230 + /* 12.1231 + * Grab both page_list locks, in order. This prevents the page from 12.1232 + * disappearing elsewhere while we modify the owner, and we'll need 12.1233 + * both locks if we're successful so that we can change lists. 12.1234 + */ 12.1235 + if ( d < e ) 12.1236 + { 12.1237 + spin_lock(&d->page_alloc_lock); 12.1238 + spin_lock(&e->page_alloc_lock); 12.1239 + } 12.1240 + else 12.1241 + { 12.1242 + spin_lock(&e->page_alloc_lock); 12.1243 + spin_lock(&d->page_alloc_lock); 12.1244 + } 12.1245 + 12.1246 + /* A domain shouldn't have PGC_allocated pages when it is dying. */ 12.1247 + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || 12.1248 + unlikely(IS_XEN_HEAP_FRAME(page)) ) 12.1249 + { 12.1250 + MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); 12.1251 + okay = 0; 12.1252 + goto reassign_fail; 12.1253 + } 12.1254 + 12.1255 + /* 12.1256 + * The tricky bit: atomically change owner while there is just one 12.1257 + * benign reference to the page (PGC_allocated). If that reference 12.1258 + * disappears then the deallocation routine will safely spin. 12.1259 + */ 12.1260 + nd = page_get_owner(page); 12.1261 + y = page->count_info; 12.1262 + do { 12.1263 + x = y; 12.1264 + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 12.1265 + (1|PGC_allocated)) || 12.1266 + unlikely(nd != d) ) 12.1267 + { 12.1268 + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," 12.1269 + " caf=%08x, taf=%08x\n", page_to_pfn(page), 12.1270 + d, d->id, nd, x, page->u.inuse.type_info); 12.1271 + okay = 0; 12.1272 + goto reassign_fail; 12.1273 + } 12.1274 + __asm__ __volatile__( 12.1275 + LOCK_PREFIX "cmpxchg8b %3" 12.1276 + : "=d" (nd), "=a" (y), "=c" (e), 12.1277 + "=m" (*(volatile u64 *)(&page->count_info)) 12.1278 + : "0" (d), "1" (x), "c" (e), "b" (x) ); 12.1279 + } 12.1280 + while ( unlikely(nd != d) || unlikely(y != x) ); 12.1281 + 12.1282 + /* 12.1283 + * Unlink from 'd'. We transferred at least one reference to 'e', so 12.1284 + * noone else is spinning to try to delete this page from 'd'. 12.1285 + */ 12.1286 + d->tot_pages--; 12.1287 + list_del(&page->list); 12.1288 + 12.1289 + /* 12.1290 + * Add the page to 'e'. Someone may already have removed the last 12.1291 + * reference and want to remove the page from 'e'. However, we have 12.1292 + * the lock so they'll spin waiting for us. 
12.1293 + */ 12.1294 + if ( unlikely(e->tot_pages++ == 0) ) 12.1295 + get_knownalive_domain(e); 12.1296 + list_add_tail(&page->list, &e->page_list); 12.1297 + 12.1298 + reassign_fail: 12.1299 + spin_unlock(&d->page_alloc_lock); 12.1300 + spin_unlock(&e->page_alloc_lock); 12.1301 + break; 12.1302 + 12.1303 + case MMUEXT_CLEAR_FOREIGNDOM: 12.1304 + if ( (e = percpu_info[cpu].foreign) != NULL ) 12.1305 + put_domain(e); 12.1306 + percpu_info[cpu].foreign = NULL; 12.1307 + break; 12.1308 + 12.1309 + default: 12.1310 + MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 12.1311 + okay = 0; 12.1312 + break; 12.1313 + } 12.1314 + 12.1315 + return okay; 12.1316 +} 12.1317 + 12.1318 +int do_mmu_update( 12.1319 + mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) 12.1320 +{ 12.1321 +/* 12.1322 + * We steal the m.s.b. of the @count parameter to indicate whether this 12.1323 + * invocation of do_mmu_update() is resuming a previously preempted call. 12.1324 + * We steal the next 15 bits to remember the current FOREIGNDOM. 12.1325 + */ 12.1326 +#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) 12.1327 +#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) 12.1328 +#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT) 12.1329 + 12.1330 + mmu_update_t req; 12.1331 + unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0; 12.1332 + struct pfn_info *page; 12.1333 + int rc = 0, okay = 1, i = 0, cpu = smp_processor_id(); 12.1334 + unsigned int cmd, done = 0; 12.1335 + unsigned long prev_smfn = 0; 12.1336 + l1_pgentry_t *prev_spl1e = 0; 12.1337 + struct exec_domain *ed = current; 12.1338 + struct domain *d = ed->domain; 12.1339 + u32 type_info; 12.1340 + domid_t domid; 12.1341 + 12.1342 + LOCK_BIGLOCK(d); 12.1343 + 12.1344 + cleanup_writable_pagetable(d); 12.1345 + 12.1346 + if ( unlikely(shadow_mode(d)) ) 12.1347 + check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */ 12.1348 + 12.1349 + /* 12.1350 + * If we are resuming after preemption, read how much work we have already 12.1351 + * done. This allows us to set the @done output parameter correctly. 12.1352 + * We also reset FOREIGNDOM here. 12.1353 + */ 12.1354 + if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) 12.1355 + { 12.1356 + if ( !(count & MMU_UPDATE_PREEMPTED) ) 12.1357 + { 12.1358 + /* Count overflow into private FOREIGNDOM field. 
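As a concrete illustration of this packing (numbers hypothetical): if a batch of 100 updates issued while FOREIGNDOM was domain 5 is preempted after 40 of them, the continuation re-enters with

count = (100 - 40)                              /* work still to do       */
      | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT)    /* remembered FOREIGNDOM  */
      | MMU_UPDATE_PREEMPTED;                   /* resume marker          */

and the code below recovers domid = 5, count = 60, and done = 40 via *pdone. A plain call whose count does not fit in the low 16 bits would spill into the FOREIGNDOM field without the resume marker, which is exactly the overflow rejected here.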
*/ 12.1359 + MEM_LOG("do_mmu_update count is too large"); 12.1360 + rc = -EINVAL; 12.1361 + goto out; 12.1362 + } 12.1363 + count &= ~MMU_UPDATE_PREEMPTED; 12.1364 + domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; 12.1365 + count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; 12.1366 + if ( unlikely(pdone != NULL) ) 12.1367 + (void)get_user(done, pdone); 12.1368 + if ( (domid != current->domain->id) && 12.1369 + !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) 12.1370 + { 12.1371 + rc = -EINVAL; 12.1372 + goto out; 12.1373 + } 12.1374 + } 12.1375 + 12.1376 + perfc_incrc(calls_to_mmu_update); 12.1377 + perfc_addc(num_page_updates, count); 12.1378 + 12.1379 + if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) 12.1380 + { 12.1381 + rc = -EFAULT; 12.1382 + goto out; 12.1383 + } 12.1384 + 12.1385 + for ( i = 0; i < count; i++ ) 12.1386 + { 12.1387 + if ( hypercall_preempt_check() ) 12.1388 + { 12.1389 + rc = hypercall3_create_continuation( 12.1390 + __HYPERVISOR_mmu_update, ureqs, 12.1391 + (count - i) | 12.1392 + (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 12.1393 + MMU_UPDATE_PREEMPTED, pdone); 12.1394 + break; 12.1395 + } 12.1396 + 12.1397 + if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) 12.1398 + { 12.1399 + MEM_LOG("Bad __copy_from_user"); 12.1400 + rc = -EFAULT; 12.1401 + break; 12.1402 + } 12.1403 + 12.1404 + cmd = req.ptr & (sizeof(l1_pgentry_t)-1); 12.1405 + pfn = req.ptr >> PAGE_SHIFT; 12.1406 + 12.1407 + okay = 0; 12.1408 + 12.1409 + switch ( cmd ) 12.1410 + { 12.1411 + /* 12.1412 + * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 12.1413 + */ 12.1414 + case MMU_NORMAL_PT_UPDATE: 12.1415 + if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) 12.1416 + { 12.1417 + MEM_LOG("Could not get page for normal update"); 12.1418 + break; 12.1419 + } 12.1420 + 12.1421 + if ( likely(prev_pfn == pfn) ) 12.1422 + { 12.1423 + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 12.1424 + } 12.1425 + else 12.1426 + { 12.1427 + if ( prev_pfn != 0 ) 12.1428 + unmap_domain_mem((void *)va); 12.1429 + va = (unsigned long)map_domain_mem(req.ptr); 12.1430 + prev_pfn = pfn; 12.1431 + } 12.1432 + 12.1433 + page = &frame_table[pfn]; 12.1434 + switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) 12.1435 + { 12.1436 + case PGT_l1_page_table: 12.1437 + if ( likely(get_page_type( 12.1438 + page, type_info & (PGT_type_mask|PGT_va_mask))) ) 12.1439 + { 12.1440 + okay = mod_l1_entry((l1_pgentry_t *)va, 12.1441 + mk_l1_pgentry(req.val)); 12.1442 + 12.1443 + if ( unlikely(shadow_mode(d)) && okay && 12.1444 + (get_shadow_status(d, page-frame_table) & 12.1445 + PSH_shadowed) ) 12.1446 + { 12.1447 + shadow_l1_normal_pt_update( 12.1448 + req.ptr, req.val, &prev_smfn, &prev_spl1e); 12.1449 + put_shadow_status(d); 12.1450 + } 12.1451 + 12.1452 + put_page_type(page); 12.1453 + } 12.1454 + break; 12.1455 + case PGT_l2_page_table: 12.1456 + if ( likely(get_page_type(page, PGT_l2_page_table)) ) 12.1457 + { 12.1458 + okay = mod_l2_entry((l2_pgentry_t *)va, 12.1459 + mk_l2_pgentry(req.val), 12.1460 + pfn); 12.1461 + 12.1462 + if ( unlikely(shadow_mode(d)) && okay && 12.1463 + (get_shadow_status(d, page-frame_table) & 12.1464 + PSH_shadowed) ) 12.1465 + { 12.1466 + shadow_l2_normal_pt_update(req.ptr, req.val); 12.1467 + put_shadow_status(d); 12.1468 + } 12.1469 + 12.1470 + put_page_type(page); 12.1471 + } 12.1472 + break; 12.1473 + default: 12.1474 + if ( likely(get_page_type(page, PGT_writable_page)) ) 12.1475 + { 12.1476 + *(unsigned long *)va = 
req.val; 12.1477 + okay = 1; 12.1478 + put_page_type(page); 12.1479 + } 12.1480 + break; 12.1481 + } 12.1482 + 12.1483 + put_page(page); 12.1484 + break; 12.1485 + 12.1486 + case MMU_MACHPHYS_UPDATE: 12.1487 + if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) 12.1488 + { 12.1489 + MEM_LOG("Could not get page for mach->phys update"); 12.1490 + break; 12.1491 + } 12.1492 + 12.1493 + machine_to_phys_mapping[pfn] = req.val; 12.1494 + okay = 1; 12.1495 + 12.1496 + /* 12.1497 + * If in log-dirty mode, mark the corresponding pseudo-physical 12.1498 + * page as dirty. 12.1499 + */ 12.1500 + if ( unlikely(shadow_mode(d) == SHM_logdirty) && 12.1501 + mark_dirty(d, pfn) ) 12.1502 + d->arch.shadow_dirty_block_count++; 12.1503 + 12.1504 + put_page(&frame_table[pfn]); 12.1505 + break; 12.1506 + 12.1507 + /* 12.1508 + * MMU_EXTENDED_COMMAND: Extended command is specified 12.1509 + * in the least-siginificant bits of the 'value' field. 12.1510 + */ 12.1511 + case MMU_EXTENDED_COMMAND: 12.1512 + req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 12.1513 + okay = do_extended_command(req.ptr, req.val); 12.1514 + break; 12.1515 + 12.1516 + default: 12.1517 + MEM_LOG("Invalid page update command %08lx", req.ptr); 12.1518 + break; 12.1519 + } 12.1520 + 12.1521 + if ( unlikely(!okay) ) 12.1522 + { 12.1523 + rc = -EINVAL; 12.1524 + break; 12.1525 + } 12.1526 + 12.1527 + ureqs++; 12.1528 + } 12.1529 + 12.1530 + out: 12.1531 + if ( prev_pfn != 0 ) 12.1532 + unmap_domain_mem((void *)va); 12.1533 + 12.1534 + if ( unlikely(prev_spl1e != 0) ) 12.1535 + unmap_domain_mem((void *)prev_spl1e); 12.1536 + 12.1537 + deferred_ops = percpu_info[cpu].deferred_ops; 12.1538 + percpu_info[cpu].deferred_ops = 0; 12.1539 + 12.1540 + if ( deferred_ops & DOP_FLUSH_TLB ) 12.1541 + local_flush_tlb(); 12.1542 + 12.1543 + if ( deferred_ops & DOP_RELOAD_LDT ) 12.1544 + (void)map_ldt_shadow_page(0); 12.1545 + 12.1546 + if ( unlikely(percpu_info[cpu].foreign != NULL) ) 12.1547 + { 12.1548 + put_domain(percpu_info[cpu].foreign); 12.1549 + percpu_info[cpu].foreign = NULL; 12.1550 + } 12.1551 + 12.1552 + /* Add incremental work we have done to the @done output parameter. */ 12.1553 + if ( unlikely(pdone != NULL) ) 12.1554 + __put_user(done + i, pdone); 12.1555 + 12.1556 + if ( unlikely(shadow_mode(d)) ) 12.1557 + check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */ 12.1558 + 12.1559 + UNLOCK_BIGLOCK(d); 12.1560 + return rc; 12.1561 +} 12.1562 + 12.1563 + 12.1564 +int do_update_va_mapping(unsigned long va, 12.1565 + unsigned long val, 12.1566 + unsigned long flags) 12.1567 +{ 12.1568 + struct exec_domain *ed = current; 12.1569 + struct domain *d = ed->domain; 12.1570 + int err = 0; 12.1571 + unsigned int cpu = ed->processor; 12.1572 + unsigned long deferred_ops; 12.1573 + 12.1574 + perfc_incrc(calls_to_update_va); 12.1575 + 12.1576 + if ( unlikely(!__addr_ok(va)) ) 12.1577 + return -EINVAL; 12.1578 + 12.1579 + LOCK_BIGLOCK(d); 12.1580 + 12.1581 + cleanup_writable_pagetable(d); 12.1582 + 12.1583 + /* 12.1584 + * XXX When we make this support 4MB superpages we should also deal with 12.1585 + * the case of updating L2 entries. 
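For reference, a guest drives do_mmu_update() above with a batch along these lines (a sketch: the mfn and index values are hypothetical, and HYPERVISOR_mmu_update is assumed to be the usual guest-side hypercall stub):

/* Sketch of a two-request batch for do_mmu_update() (values hypothetical). */
mmu_update_t req[2];

/* Normal PT update: ptr is the machine address of the PTE to write; the low
 * two bits select the command (PTEs are 4-byte aligned, so they are free). */
req[0].ptr = ((pt_mfn << PAGE_SHIFT) + entry_idx * sizeof(l1_pgentry_t)) |
             MMU_NORMAL_PT_UPDATE;
req[0].val = (data_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

/* Extended command: pin the nominated frame as an L2 page table. */
req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
req[1].val = MMUEXT_PIN_L2_TABLE;

(void)HYPERVISOR_mmu_update(req, 2, NULL);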
12.1586 + */ 12.1587 + 12.1588 + if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], 12.1589 + mk_l1_pgentry(val))) ) 12.1590 + err = -EINVAL; 12.1591 + 12.1592 + if ( unlikely(shadow_mode(d)) ) 12.1593 + { 12.1594 + unsigned long sval = 0; 12.1595 + 12.1596 + l1pte_propagate_from_guest(d, &val, &sval); 12.1597 + 12.1598 + if ( unlikely(__put_user(sval, ((unsigned long *)( 12.1599 + &shadow_linear_pg_table[l1_linear_offset(va)])))) ) 12.1600 + { 12.1601 + /* 12.1602 + * Since L2's are guranteed RW, failure indicates the page was not 12.1603 + * shadowed, so ignore. 12.1604 + */ 12.1605 + perfc_incrc(shadow_update_va_fail); 12.1606 + } 12.1607 + 12.1608 + /* 12.1609 + * If we're in log-dirty mode then we need to note that we've updated 12.1610 + * the PTE in the PT-holding page. We need the machine frame number 12.1611 + * for this. 12.1612 + */ 12.1613 + if ( shadow_mode(d) == SHM_logdirty ) 12.1614 + mark_dirty(d, va_to_l1mfn(va)); 12.1615 + 12.1616 + check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ 12.1617 + } 12.1618 + 12.1619 + deferred_ops = percpu_info[cpu].deferred_ops; 12.1620 + percpu_info[cpu].deferred_ops = 0; 12.1621 + 12.1622 + if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 12.1623 + unlikely(flags & UVMF_FLUSH_TLB) ) 12.1624 + local_flush_tlb(); 12.1625 + else if ( unlikely(flags & UVMF_INVLPG) ) 12.1626 + __flush_tlb_one(va); 12.1627 + 12.1628 + if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) 12.1629 + (void)map_ldt_shadow_page(0); 12.1630 + 12.1631 + UNLOCK_BIGLOCK(d); 12.1632 + 12.1633 + return err; 12.1634 +} 12.1635 + 12.1636 +int do_update_va_mapping_otherdomain(unsigned long va, 12.1637 + unsigned long val, 12.1638 + unsigned long flags, 12.1639 + domid_t domid) 12.1640 +{ 12.1641 + unsigned int cpu = smp_processor_id(); 12.1642 + struct domain *d; 12.1643 + int rc; 12.1644 + 12.1645 + if ( unlikely(!IS_PRIV(current->domain)) ) 12.1646 + return -EPERM; 12.1647 + 12.1648 + percpu_info[cpu].foreign = d = find_domain_by_id(domid); 12.1649 + if ( unlikely(d == NULL) ) 12.1650 + { 12.1651 + MEM_LOG("Unknown domain '%u'", domid); 12.1652 + return -ESRCH; 12.1653 + } 12.1654 + 12.1655 + rc = do_update_va_mapping(va, val, flags); 12.1656 + 12.1657 + put_domain(d); 12.1658 + percpu_info[cpu].foreign = NULL; 12.1659 + 12.1660 + return rc; 12.1661 +} 12.1662 + 12.1663 + 12.1664 + 12.1665 +/************************* 12.1666 + * Descriptor Tables 12.1667 + */ 12.1668 + 12.1669 +void destroy_gdt(struct exec_domain *ed) 12.1670 +{ 12.1671 + int i; 12.1672 + unsigned long pfn; 12.1673 + 12.1674 + for ( i = 0; i < 16; i++ ) 12.1675 + { 12.1676 + if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) 12.1677 + put_page_and_type(&frame_table[pfn]); 12.1678 + ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); 12.1679 + } 12.1680 +} 12.1681 + 12.1682 + 12.1683 +long set_gdt(struct exec_domain *ed, 12.1684 + unsigned long *frames, 12.1685 + unsigned int entries) 12.1686 +{ 12.1687 + struct domain *d = ed->domain; 12.1688 + /* NB. There are 512 8-byte entries per GDT page. */ 12.1689 + int i = 0, nr_pages = (entries + 511) / 512; 12.1690 + struct desc_struct *vgdt; 12.1691 + unsigned long pfn; 12.1692 + 12.1693 + /* Check the first page in the new GDT. */ 12.1694 + if ( (pfn = frames[0]) >= max_page ) 12.1695 + goto fail; 12.1696 + 12.1697 + /* The first page is special because Xen owns a range of entries in it. 
*/ 12.1698 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 12.1699 + { 12.1700 + /* GDT checks failed: try zapping the Xen reserved entries. */ 12.1701 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) 12.1702 + goto fail; 12.1703 + vgdt = map_domain_mem(pfn << PAGE_SHIFT); 12.1704 + memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, 12.1705 + NR_RESERVED_GDT_ENTRIES*8); 12.1706 + unmap_domain_mem(vgdt); 12.1707 + put_page_and_type(&frame_table[pfn]); 12.1708 + 12.1709 + /* Okay, we zapped the entries. Now try the GDT checks again. */ 12.1710 + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 12.1711 + goto fail; 12.1712 + } 12.1713 + 12.1714 + /* Check the remaining pages in the new GDT. */ 12.1715 + for ( i = 1; i < nr_pages; i++ ) 12.1716 + if ( ((pfn = frames[i]) >= max_page) || 12.1717 + !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) 12.1718 + goto fail; 12.1719 + 12.1720 + /* Copy reserved GDT entries to the new GDT. */ 12.1721 + vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 12.1722 + memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 12.1723 + gdt_table + FIRST_RESERVED_GDT_ENTRY, 12.1724 + NR_RESERVED_GDT_ENTRIES*8); 12.1725 + unmap_domain_mem(vgdt); 12.1726 + 12.1727 + /* Tear down the old GDT. */ 12.1728 + destroy_gdt(ed); 12.1729 + 12.1730 + /* Install the new GDT. */ 12.1731 + for ( i = 0; i < nr_pages; i++ ) 12.1732 + ed->arch.perdomain_ptes[i] = 12.1733 + mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 12.1734 + 12.1735 + SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); 12.1736 + SET_GDT_ENTRIES(ed, entries); 12.1737 + 12.1738 + return 0; 12.1739 + 12.1740 + fail: 12.1741 + while ( i-- > 0 ) 12.1742 + put_page_and_type(&frame_table[frames[i]]); 12.1743 + return -EINVAL; 12.1744 +} 12.1745 + 12.1746 + 12.1747 +long do_set_gdt(unsigned long *frame_list, unsigned int entries) 12.1748 +{ 12.1749 + int nr_pages = (entries + 511) / 512; 12.1750 + unsigned long frames[16]; 12.1751 + long ret; 12.1752 + 12.1753 + if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 12.1754 + return -EINVAL; 12.1755 + 12.1756 + if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) 12.1757 + return -EFAULT; 12.1758 + 12.1759 + LOCK_BIGLOCK(current->domain); 12.1760 + 12.1761 + if ( (ret = set_gdt(current, frames, entries)) == 0 ) 12.1762 + { 12.1763 + local_flush_tlb(); 12.1764 + __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); 12.1765 + } 12.1766 + 12.1767 + UNLOCK_BIGLOCK(current->domain); 12.1768 + 12.1769 + return ret; 12.1770 +} 12.1771 + 12.1772 + 12.1773 +long do_update_descriptor( 12.1774 + unsigned long pa, unsigned long word1, unsigned long word2) 12.1775 +{ 12.1776 + unsigned long pfn = pa >> PAGE_SHIFT; 12.1777 + struct desc_struct *gdt_pent, d; 12.1778 + struct pfn_info *page; 12.1779 + struct exec_domain *ed; 12.1780 + long ret = -EINVAL; 12.1781 + 12.1782 + d.a = (u32)word1; 12.1783 + d.b = (u32)word2; 12.1784 + 12.1785 + LOCK_BIGLOCK(current->domain); 12.1786 + 12.1787 + if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { 12.1788 + UNLOCK_BIGLOCK(current->domain); 12.1789 + return -EINVAL; 12.1790 + } 12.1791 + 12.1792 + page = &frame_table[pfn]; 12.1793 + if ( unlikely(!get_page(page, current->domain)) ) { 12.1794 + UNLOCK_BIGLOCK(current->domain); 12.1795 + return -EINVAL; 12.1796 + } 12.1797 + 12.1798 + /* Check if the given frame is in use in an unsafe context. 
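For context, a typical call into this function supplies the machine address of one 8-byte descriptor slot plus the two descriptor words, e.g. a flat 4GB ring-1 code segment (the address arithmetic is hypothetical):

/* Sketch: rewrite one descriptor slot with a flat ring-1 code segment. */
unsigned long pa    = (gdt_mfn << PAGE_SHIFT) + (slot * 8);  /* 8-byte aligned */
unsigned long word1 = 0x0000ffff;   /* limit 15:0 = 0xffff, base 15:0 = 0      */
unsigned long word2 = 0x00cfba00;   /* G+D, limit 19:16 = 0xf, P, DPL=1, code  */

ret = do_update_descriptor(pa, word1, word2);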
*/ 12.1799 + switch ( page->u.inuse.type_info & PGT_type_mask ) 12.1800 + { 12.1801 + case PGT_gdt_page: 12.1802 + /* Disallow updates of Xen-reserved descriptors in the current GDT. */ 12.1803 + for_each_exec_domain(current->domain, ed) { 12.1804 + if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) && 12.1805 + (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 12.1806 + (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 12.1807 + goto out; 12.1808 + } 12.1809 + if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 12.1810 + goto out; 12.1811 + break; 12.1812 + case PGT_ldt_page: 12.1813 + if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 12.1814 + goto out; 12.1815 + break; 12.1816 + default: 12.1817 + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) 12.1818 + goto out; 12.1819 + break; 12.1820 + } 12.1821 + 12.1822 + /* All is good so make the update. */ 12.1823 + gdt_pent = map_domain_mem(pa); 12.1824 + memcpy(gdt_pent, &d, 8); 12.1825 + unmap_domain_mem(gdt_pent); 12.1826 + 12.1827 + put_page_type(page); 12.1828 + 12.1829 + ret = 0; /* success */ 12.1830 + 12.1831 + out: 12.1832 + put_page(page); 12.1833 + 12.1834 + UNLOCK_BIGLOCK(current->domain); 12.1835 + 12.1836 + return ret; 12.1837 +} 12.1838 + 12.1839 + 12.1840 + 12.1841 +/************************* 12.1842 + * Writable Pagetables 12.1843 + */ 12.1844 + 12.1845 +ptwr_info_t ptwr_info[NR_CPUS]; 12.1846 + 12.1847 +#ifdef VERBOSE 12.1848 +int ptwr_debug = 0x0; 12.1849 +#define PTWR_PRINTK(_f, _a...) \ 12.1850 + do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) 12.1851 +#define PTWR_PRINT_WHICH (which ? 'I' : 'A') 12.1852 +#else 12.1853 +#define PTWR_PRINTK(_f, _a...) ((void)0) 12.1854 +#endif 12.1855 + 12.1856 +/* Flush the given writable p.t. page and write-protect it again. */ 12.1857 +void ptwr_flush(const int which) 12.1858 +{ 12.1859 + unsigned long sstat, spte, pte, *ptep, l1va; 12.1860 + l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; 12.1861 + l2_pgentry_t *pl2e; 12.1862 + int i, cpu = smp_processor_id(); 12.1863 + struct exec_domain *ed = current; 12.1864 + struct domain *d = ed->domain; 12.1865 + 12.1866 + l1va = ptwr_info[cpu].ptinfo[which].l1va; 12.1867 + ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; 12.1868 + 12.1869 + /* 12.1870 + * STEP 1. Write-protect the p.t. page so no more updates can occur. 12.1871 + */ 12.1872 + 12.1873 + if ( unlikely(__get_user(pte, ptep)) ) 12.1874 + { 12.1875 + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 12.1876 + /* 12.1877 + * Really a bug. We could read this PTE during the initial fault, 12.1878 + * and pagetables can't have changed meantime. XXX Multi-CPU guests? 12.1879 + */ 12.1880 + BUG(); 12.1881 + } 12.1882 + PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", 12.1883 + PTWR_PRINT_WHICH, ptep, pte); 12.1884 + pte &= ~_PAGE_RW; 12.1885 + 12.1886 + if ( unlikely(shadow_mode(d)) ) 12.1887 + { 12.1888 + /* Write-protect the p.t. page in the shadow page table. */ 12.1889 + l1pte_propagate_from_guest(d, &pte, &spte); 12.1890 + __put_user( 12.1891 + spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); 12.1892 + 12.1893 + /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ 12.1894 + sstat = get_shadow_status(d, pte >> PAGE_SHIFT); 12.1895 + if ( sstat & PSH_shadowed ) 12.1896 + sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); 12.1897 + } 12.1898 + 12.1899 + /* Write-protect the p.t. page in the guest page table. 
*/ 12.1900 + if ( unlikely(__put_user(pte, ptep)) ) 12.1901 + { 12.1902 + MEM_LOG("ptwr: Could not update pte at %p\n", ptep); 12.1903 + /* 12.1904 + * Really a bug. We could write this PTE during the initial fault, 12.1905 + * and pagetables can't have changed meantime. XXX Multi-CPU guests? 12.1906 + */ 12.1907 + BUG(); 12.1908 + } 12.1909 + 12.1910 + /* Ensure that there are no stale writable mappings in any TLB. */ 12.1911 + /* NB. INVLPG is a serialising instruction: flushes pending updates. */ 12.1912 +#if 1 12.1913 + __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ 12.1914 +#else 12.1915 + flush_tlb_all(); 12.1916 +#endif 12.1917 + PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", 12.1918 + PTWR_PRINT_WHICH, ptep, pte); 12.1919 + 12.1920 + /* 12.1921 + * STEP 2. Validate any modified PTEs. 12.1922 + */ 12.1923 + 12.1924 + pl1e = ptwr_info[cpu].ptinfo[which].pl1e; 12.1925 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.1926 + { 12.1927 + ol1e = ptwr_info[cpu].ptinfo[which].page[i]; 12.1928 + nl1e = pl1e[i]; 12.1929 + 12.1930 + if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) 12.1931 + continue; 12.1932 + 12.1933 + /* 12.1934 + * Fast path for PTEs that have merely been write-protected 12.1935 + * (e.g., during a Unix fork()). A strict reduction in privilege. 12.1936 + */ 12.1937 + if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) 12.1938 + { 12.1939 + if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) 12.1940 + { 12.1941 + if ( unlikely(sl1e != NULL) ) 12.1942 + l1pte_propagate_from_guest( 12.1943 + d, &l1_pgentry_val(nl1e), 12.1944 + &l1_pgentry_val(sl1e[i])); 12.1945 + put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); 12.1946 + } 12.1947 + continue; 12.1948 + } 12.1949 + 12.1950 + if ( unlikely(!get_page_from_l1e(nl1e, d)) ) 12.1951 + { 12.1952 + MEM_LOG("ptwr: Could not re-validate l1 page\n"); 12.1953 + /* 12.1954 + * Make the remaining p.t's consistent before crashing, so the 12.1955 + * reference counts are correct. 12.1956 + */ 12.1957 + memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], 12.1958 + (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); 12.1959 + unmap_domain_mem(pl1e); 12.1960 + ptwr_info[cpu].ptinfo[which].l1va = 0; 12.1961 + UNLOCK_BIGLOCK(d); 12.1962 + domain_crash(); 12.1963 + } 12.1964 + 12.1965 + if ( unlikely(sl1e != NULL) ) 12.1966 + l1pte_propagate_from_guest( 12.1967 + d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); 12.1968 + 12.1969 + if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) 12.1970 + put_page_from_l1e(ol1e, d); 12.1971 + } 12.1972 + unmap_domain_mem(pl1e); 12.1973 + 12.1974 + /* 12.1975 + * STEP 3. Reattach the L1 p.t. page into the current address space. 12.1976 + */ 12.1977 + 12.1978 + if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) ) 12.1979 + { 12.1980 + pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; 12.1981 + *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 12.1982 + } 12.1983 + 12.1984 + /* 12.1985 + * STEP 4. Final tidy-up. 12.1986 + */ 12.1987 + 12.1988 + ptwr_info[cpu].ptinfo[which].l1va = 0; 12.1989 + 12.1990 + if ( unlikely(sl1e != NULL) ) 12.1991 + { 12.1992 + unmap_domain_mem(sl1e); 12.1993 + put_shadow_status(d); 12.1994 + } 12.1995 +} 12.1996 + 12.1997 +/* Write page fault handler: check if guest is trying to modify a PTE. 
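Taken together with ptwr_do_page_fault() below, the writable-pagetable machinery follows roughly this cycle (a simplified summary):

/*
 *  1. Guest writes to a read-only L1 page  ->  ptwr_do_page_fault():
 *       - if the L1 is hooked into the current address space (ACTIVE),
 *         unhook it from its L2 slot and flush the TLB;
 *       - snapshot the page into ptwr_info[cpu].ptinfo[which].page[];
 *       - set _PAGE_RW on the mapping and return EXCRET_fault_fixed.
 *  2. Guest performs ordinary writes to the page, trap-free.
 *  3. ptwr_flush() (reached from cleanup_writable_pagetable() on the next
 *     relevant hypercall, or when the ACTIVE/INACTIVE slot must be reused):
 *       - write-protect the page again and flush the stale TLB entry;
 *       - diff against the snapshot and re-validate only the changed PTEs;
 *       - rehook the L1 into its L2 slot if it was ACTIVE.
 */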
*/ 12.1998 +int ptwr_do_page_fault(unsigned long addr) 12.1999 +{ 12.2000 + unsigned long pte, pfn, l2e; 12.2001 + struct pfn_info *page; 12.2002 + l2_pgentry_t *pl2e; 12.2003 + int which, cpu = smp_processor_id(); 12.2004 + u32 l2_idx; 12.2005 + 12.2006 +#ifdef __x86_64__ 12.2007 + return 0; /* Writable pagetables need fixing for x86_64. */ 12.2008 +#endif 12.2009 + 12.2010 + /* 12.2011 + * Attempt to read the PTE that maps the VA being accessed. By checking for 12.2012 + * PDE validity in the L2 we avoid many expensive fixups in __get_user(). 12.2013 + */ 12.2014 + if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & 12.2015 + _PAGE_PRESENT) || 12.2016 + __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) 12.2017 + { 12.2018 + return 0; 12.2019 + } 12.2020 + 12.2021 + pfn = pte >> PAGE_SHIFT; 12.2022 + page = &frame_table[pfn]; 12.2023 + 12.2024 + /* We are looking only for read-only mappings of p.t. pages. */ 12.2025 + if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || 12.2026 + ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) 12.2027 + { 12.2028 + return 0; 12.2029 + } 12.2030 + 12.2031 + /* Get the L2 index at which this L1 p.t. is always mapped. */ 12.2032 + l2_idx = page->u.inuse.type_info & PGT_va_mask; 12.2033 + if ( unlikely(l2_idx >= PGT_va_unknown) ) 12.2034 + { 12.2035 + domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ 12.2036 + } 12.2037 + l2_idx >>= PGT_va_shift; 12.2038 + 12.2039 + if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) 12.2040 + { 12.2041 + MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); 12.2042 + domain_crash(); 12.2043 + } 12.2044 + 12.2045 + /* 12.2046 + * Is the L1 p.t. mapped into the current address space? If so we call it 12.2047 + * an ACTIVE p.t., otherwise it is INACTIVE. 12.2048 + */ 12.2049 + pl2e = &linear_l2_table[l2_idx]; 12.2050 + l2e = l2_pgentry_val(*pl2e); 12.2051 + which = PTWR_PT_INACTIVE; 12.2052 + if ( (l2e >> PAGE_SHIFT) == pfn ) 12.2053 + { 12.2054 + /* Check the PRESENT bit to set ACTIVE. */ 12.2055 + if ( likely(l2e & _PAGE_PRESENT) ) 12.2056 + which = PTWR_PT_ACTIVE; 12.2057 + else { 12.2058 + /* 12.2059 + * If the PRESENT bit is clear, we may be conflicting with 12.2060 + * the current ACTIVE p.t. (it may be the same p.t. mapped 12.2061 + * at another virt addr). 12.2062 + * The ptwr_flush call below will restore the PRESENT bit. 12.2063 + */ 12.2064 + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && 12.2065 + l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) 12.2066 + which = PTWR_PT_ACTIVE; 12.2067 + } 12.2068 + } 12.2069 + 12.2070 + PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " 12.2071 + "pfn %08lx\n", PTWR_PRINT_WHICH, 12.2072 + addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); 12.2073 + 12.2074 + /* 12.2075 + * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at 12.2076 + * time. If there is already one, we must flush it out. 12.2077 + */ 12.2078 + if ( ptwr_info[cpu].ptinfo[which].l1va ) 12.2079 + ptwr_flush(which); 12.2080 + 12.2081 + ptwr_info[cpu].ptinfo[which].l1va = addr | 1; 12.2082 + ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; 12.2083 + 12.2084 + /* For safety, disconnect the L1 p.t. page from current space. */ 12.2085 + if ( (which == PTWR_PT_ACTIVE) && 12.2086 + likely(!shadow_mode(current->domain)) ) 12.2087 + { 12.2088 + *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); 12.2089 +#if 1 12.2090 + flush_tlb(); /* XXX Multi-CPU guests? 
*/ 12.2091 +#else 12.2092 + flush_tlb_all(); 12.2093 +#endif 12.2094 + } 12.2095 + 12.2096 + /* Temporarily map the L1 page, and make a copy of it. */ 12.2097 + ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); 12.2098 + memcpy(ptwr_info[cpu].ptinfo[which].page, 12.2099 + ptwr_info[cpu].ptinfo[which].pl1e, 12.2100 + ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); 12.2101 + 12.2102 + /* Finally, make the p.t. page writable by the guest OS. */ 12.2103 + pte |= _PAGE_RW; 12.2104 + PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, 12.2105 + &linear_pg_table[addr>>PAGE_SHIFT], pte); 12.2106 + if ( unlikely(__put_user(pte, (unsigned long *) 12.2107 + &linear_pg_table[addr>>PAGE_SHIFT])) ) 12.2108 + { 12.2109 + MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) 12.2110 + &linear_pg_table[addr>>PAGE_SHIFT]); 12.2111 + /* Toss the writable pagetable state and crash. */ 12.2112 + unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); 12.2113 + ptwr_info[cpu].ptinfo[which].l1va = 0; 12.2114 + domain_crash(); 12.2115 + } 12.2116 + 12.2117 + return EXCRET_fault_fixed; 12.2118 +} 12.2119 + 12.2120 +static __init int ptwr_init(void) 12.2121 +{ 12.2122 + int i; 12.2123 + 12.2124 + for ( i = 0; i < smp_num_cpus; i++ ) 12.2125 + { 12.2126 + ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = 12.2127 + (void *)alloc_xenheap_page(); 12.2128 + ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = 12.2129 + (void *)alloc_xenheap_page(); 12.2130 + } 12.2131 + 12.2132 + return 0; 12.2133 +} 12.2134 +__initcall(ptwr_init); 12.2135 + 12.2136 + 12.2137 + 12.2138 + 12.2139 +/************************************************************************/ 12.2140 +/************************************************************************/ 12.2141 +/************************************************************************/ 12.2142 + 12.2143 +#ifndef NDEBUG 12.2144 + 12.2145 +void ptwr_status(void) 12.2146 +{ 12.2147 + unsigned long pte, *ptep, pfn; 12.2148 + struct pfn_info *page; 12.2149 + int cpu = smp_processor_id(); 12.2150 + 12.2151 + ptep = (unsigned long *)&linear_pg_table 12.2152 + [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; 12.2153 + 12.2154 + if ( __get_user(pte, ptep) ) { 12.2155 + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); 12.2156 + domain_crash(); 12.2157 + } 12.2158 + 12.2159 + pfn = pte >> PAGE_SHIFT; 12.2160 + page = &frame_table[pfn]; 12.2161 + printk("need to alloc l1 page %p\n", page); 12.2162 + /* make pt page writable */ 12.2163 + printk("need to make read-only l1-page at %p is %08lx\n", 12.2164 + ptep, pte); 12.2165 + 12.2166 + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) 12.2167 + return; 12.2168 + 12.2169 + if ( __get_user(pte, (unsigned long *) 12.2170 + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { 12.2171 + MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) 12.2172 + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); 12.2173 + domain_crash(); 12.2174 + } 12.2175 + pfn = pte >> PAGE_SHIFT; 12.2176 + page = &frame_table[pfn]; 12.2177 +} 12.2178 + 12.2179 +void audit_domain(struct domain *d) 12.2180 +{ 12.2181 + int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; 12.2182 + 12.2183 + void adjust (struct pfn_info *page, int dir, int adjtype) 12.2184 + { 12.2185 + int count = page->count_info & PGC_count_mask; 12.2186 + 12.2187 + if ( adjtype ) 12.2188 + { 12.2189 + int tcount = page->u.inuse.type_info & PGT_count_mask; 12.2190 + 12.2191 + ttot++; 12.2192 + 12.2193 + tcount += dir; 12.2194 + 12.2195 + if ( tcount < 0 ) 12.2196 + { 
12.2197 + /* This will only come out once. */ 12.2198 + printk("Audit %d: type count whent below zero pfn=%x " 12.2199 + "taf=%x otaf=%x\n", 12.2200 + d->id, page-frame_table, 12.2201 + page->u.inuse.type_info, 12.2202 + page->tlbflush_timestamp); 12.2203 + } 12.2204 + 12.2205 + page->u.inuse.type_info = 12.2206 + (page->u.inuse.type_info & ~PGT_count_mask) | 12.2207 + (tcount & PGT_count_mask); 12.2208 + } 12.2209 + 12.2210 + ctot++; 12.2211 + count += dir; 12.2212 + if ( count < 0 ) 12.2213 + { 12.2214 + /* This will only come out once. */ 12.2215 + printk("Audit %d: general count whent below zero pfn=%x " 12.2216 + "taf=%x otaf=%x\n", 12.2217 + d->id, page-frame_table, 12.2218 + page->u.inuse.type_info, 12.2219 + page->tlbflush_timestamp); 12.2220 + } 12.2221 + 12.2222 + page->count_info = 12.2223 + (page->count_info & ~PGC_count_mask) | 12.2224 + (count & PGC_count_mask); 12.2225 + 12.2226 + } 12.2227 + 12.2228 + void scan_for_pfn(struct domain *d, unsigned long xpfn) 12.2229 + { 12.2230 + unsigned long pfn, *pt; 12.2231 + struct list_head *list_ent; 12.2232 + struct pfn_info *page; 12.2233 + int i; 12.2234 + 12.2235 + list_ent = d->page_list.next; 12.2236 + for ( i = 0; (list_ent != &d->page_list); i++ ) 12.2237 + { 12.2238 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 12.2239 + page = &frame_table[pfn]; 12.2240 + 12.2241 + switch ( page->u.inuse.type_info & PGT_type_mask ) 12.2242 + { 12.2243 + case PGT_l1_page_table: 12.2244 + case PGT_l2_page_table: 12.2245 + pt = map_domain_mem(pfn<<PAGE_SHIFT); 12.2246 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.2247 + if ( (pt[i] & _PAGE_PRESENT) && 12.2248 + ((pt[i] >> PAGE_SHIFT) == xpfn) ) 12.2249 + printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", 12.2250 + d->id, i, pfn, page->u.inuse.type_info, 12.2251 + page->count_info); 12.2252 + unmap_domain_mem(pt); 12.2253 + } 12.2254 + 12.2255 + list_ent = frame_table[pfn].list.next; 12.2256 + } 12.2257 + 12.2258 + } 12.2259 + 12.2260 + void scan_for_pfn_remote(unsigned long xpfn) 12.2261 + { 12.2262 + struct domain *e; 12.2263 + for_each_domain ( e ) 12.2264 + scan_for_pfn( e, xpfn ); 12.2265 + } 12.2266 + 12.2267 + int i; 12.2268 + unsigned long pfn; 12.2269 + struct list_head *list_ent; 12.2270 + struct pfn_info *page; 12.2271 + 12.2272 + if ( d != current->domain ) 12.2273 + domain_pause(d); 12.2274 + synchronise_pagetables(~0UL); 12.2275 + 12.2276 + printk("pt base=%lx sh_info=%x\n", 12.2277 + pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT, 12.2278 + virt_to_page(d->shared_info)-frame_table); 12.2279 + 12.2280 + spin_lock(&d->page_alloc_lock); 12.2281 + 12.2282 + /* PHASE 0 */ 12.2283 + 12.2284 + list_ent = d->page_list.next; 12.2285 + for ( i = 0; (list_ent != &d->page_list); i++ ) 12.2286 + { 12.2287 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 12.2288 + page = &frame_table[pfn]; 12.2289 + 12.2290 + if ( page_get_owner(page) != d ) 12.2291 + BUG(); 12.2292 + 12.2293 + if ( (page->u.inuse.type_info & PGT_count_mask) > 12.2294 + (page->count_info & PGC_count_mask) ) 12.2295 + printk("taf > caf %x %x pfn=%lx\n", 12.2296 + page->u.inuse.type_info, page->count_info, pfn ); 12.2297 + 12.2298 +#if 0 /* SYSV shared memory pages plus writeable files. 
*/ 12.2299 + if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 12.2300 + (page->u.inuse.type_info & PGT_count_mask) > 1 ) 12.2301 + { 12.2302 + printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", 12.2303 + pfn, 12.2304 + page->u.inuse.type_info, 12.2305 + page->count_info ); 12.2306 + scan_for_pfn_remote(pfn); 12.2307 + } 12.2308 +#endif 12.2309 + if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 12.2310 + (page->u.inuse.type_info & PGT_count_mask) > 1 ) 12.2311 + { 12.2312 + printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", 12.2313 + pfn, 12.2314 + page->u.inuse.type_info, 12.2315 + page->count_info ); 12.2316 + } 12.2317 + 12.2318 + /* Use tlbflush_timestamp to store original type_info. */ 12.2319 + page->tlbflush_timestamp = page->u.inuse.type_info; 12.2320 + 12.2321 + list_ent = frame_table[pfn].list.next; 12.2322 + } 12.2323 + 12.2324 + 12.2325 + /* PHASE 1 */ 12.2326 + 12.2327 + adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1); 12.2328 + 12.2329 + list_ent = d->page_list.next; 12.2330 + for ( i = 0; (list_ent != &d->page_list); i++ ) 12.2331 + { 12.2332 + unsigned long *pt; 12.2333 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 12.2334 + page = &frame_table[pfn]; 12.2335 + 12.2336 + if ( page_get_owner(page) != d ) 12.2337 + BUG(); 12.2338 + 12.2339 + switch ( page->u.inuse.type_info & PGT_type_mask ) 12.2340 + { 12.2341 + case PGT_l2_page_table: 12.2342 + 12.2343 + if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 12.2344 + printk("Audit %d: L2 not validated %x\n", 12.2345 + d->id, page->u.inuse.type_info); 12.2346 + 12.2347 + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 12.2348 + printk("Audit %d: L2 not pinned %x\n", 12.2349 + d->id, page->u.inuse.type_info); 12.2350 + else 12.2351 + adjust( page, -1, 1 ); 12.2352 + 12.2353 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 12.2354 + 12.2355 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 12.2356 + { 12.2357 + if ( pt[i] & _PAGE_PRESENT ) 12.2358 + { 12.2359 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 12.2360 + struct pfn_info *l1page = &frame_table[l1pfn]; 12.2361 + 12.2362 + if ( page_get_owner(l1page) != d ) 12.2363 + { 12.2364 + printk("L2: Skip bizarre page belonging to other " 12.2365 + "dom %p\n", page_get_owner(l1page)); 12.2366 + continue; 12.2367 + } 12.2368 + 12.2369 + if ( (l1page->u.inuse.type_info & PGT_type_mask) == 12.2370 + PGT_l2_page_table ) 12.2371 + printk("Audit %d: [%x] Found %s Linear PT " 12.2372 + "t=%x pfn=%lx\n", d->id, i, 12.2373 + (l1pfn==pfn) ? 
"Self" : "Other", 12.2374 + l1page->u.inuse.type_info, 12.2375 + l1pfn); 12.2376 + else if ( (l1page->u.inuse.type_info & PGT_type_mask) != 12.2377 + PGT_l1_page_table ) 12.2378 + printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", 12.2379 + d->id, i, 12.2380 + l1page->u.inuse.type_info, 12.2381 + l1pfn); 12.2382 + 12.2383 + adjust(l1page, -1, 1); 12.2384 + } 12.2385 + } 12.2386 + 12.2387 + unmap_domain_mem(pt); 12.2388 + 12.2389 + break; 12.2390 + 12.2391 + 12.2392 + case PGT_l1_page_table: 12.2393 + 12.2394 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 12.2395 + adjust( page, -1, 1 ); 12.2396 + 12.2397 + if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) 12.2398 + printk("Audit %d: L1 not validated %x\n", 12.2399 + d->id, page->u.inuse.type_info); 12.2400 +#if 0 12.2401 + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) 12.2402 + printk("Audit %d: L1 not pinned %x\n", 12.2403 + d->id, page->u.inuse.type_info); 12.2404 +#endif 12.2405 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 12.2406 + 12.2407 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.2408 + { 12.2409 + if ( pt[i] & _PAGE_PRESENT ) 12.2410 + { 12.2411 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 12.2412 + struct pfn_info *l1page = &frame_table[l1pfn]; 12.2413 + 12.2414 + if ( l1pfn < 0x100 ) 12.2415 + { 12.2416 + lowmem_mappings++; 12.2417 + continue; 12.2418 + } 12.2419 + 12.2420 + if ( l1pfn > max_page ) 12.2421 + { 12.2422 + io_mappings++; 12.2423 + continue; 12.2424 + } 12.2425 + 12.2426 + if ( pt[i] & _PAGE_RW ) 12.2427 + { 12.2428 + 12.2429 + if ( (l1page->u.inuse.type_info & PGT_type_mask) == 12.2430 + PGT_l1_page_table || 12.2431 + (l1page->u.inuse.type_info & PGT_type_mask) == 12.2432 + PGT_l2_page_table ) 12.2433 + printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", 12.2434 + d->id, i, 12.2435 + l1page->u.inuse.type_info, 12.2436 + l1pfn); 12.2437 + 12.2438 + } 12.2439 + 12.2440 + if ( page_get_owner(l1page) != d ) 12.2441 + { 12.2442 + printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " 12.2443 + "pfn=%lx c=%08x t=%08x m2p=%lx\n", 12.2444 + d->id, pfn, i, 12.2445 + page_get_owner(l1page), 12.2446 + l1pfn, 12.2447 + l1page->count_info, 12.2448 + l1page->u.inuse.type_info, 12.2449 + machine_to_phys_mapping[l1pfn]); 12.2450 + continue; 12.2451 + } 12.2452 + 12.2453 + adjust(l1page, -1, 0); 12.2454 + } 12.2455 + } 12.2456 + 12.2457 + unmap_domain_mem(pt); 12.2458 + 12.2459 + break; 12.2460 + } 12.2461 + 12.2462 + list_ent = frame_table[pfn].list.next; 12.2463 + } 12.2464 + 12.2465 + if ( (io_mappings > 0) || (lowmem_mappings > 0) ) 12.2466 + printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", 12.2467 + d->id, lowmem_mappings, io_mappings); 12.2468 + 12.2469 + /* PHASE 2 */ 12.2470 + 12.2471 + ctot = ttot = 0; 12.2472 + list_ent = d->page_list.next; 12.2473 + for ( i = 0; (list_ent != &d->page_list); i++ ) 12.2474 + { 12.2475 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 12.2476 + page = &frame_table[pfn]; 12.2477 + 12.2478 + switch ( page->u.inuse.type_info & PGT_type_mask) 12.2479 + { 12.2480 + case PGT_l1_page_table: 12.2481 + case PGT_l2_page_table: 12.2482 + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) 12.2483 + { 12.2484 + printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n", 12.2485 + d->id, page->u.inuse.type_info, 12.2486 + page->tlbflush_timestamp, 12.2487 + page->count_info, pfn ); 12.2488 + scan_for_pfn_remote(pfn); 12.2489 + } 12.2490 + default: 12.2491 + if ( (page->count_info & PGC_count_mask) != 1 ) 12.2492 + { 
12.2493 + printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", 12.2494 + d->id, 12.2495 + page->count_info, 12.2496 + page->u.inuse.type_info, 12.2497 + page->tlbflush_timestamp, pfn ); 12.2498 + scan_for_pfn_remote(pfn); 12.2499 + } 12.2500 + break; 12.2501 + } 12.2502 + 12.2503 + list_ent = frame_table[pfn].list.next; 12.2504 + } 12.2505 + 12.2506 + /* PHASE 3 */ 12.2507 + list_ent = d->page_list.next; 12.2508 + for ( i = 0; (list_ent != &d->page_list); i++ ) 12.2509 + { 12.2510 + unsigned long *pt; 12.2511 + pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 12.2512 + page = &frame_table[pfn]; 12.2513 + 12.2514 + switch ( page->u.inuse.type_info & PGT_type_mask ) 12.2515 + { 12.2516 + case PGT_l2_page_table: 12.2517 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 12.2518 + adjust( page, 1, 1 ); 12.2519 + 12.2520 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 12.2521 + 12.2522 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 12.2523 + { 12.2524 + if ( pt[i] & _PAGE_PRESENT ) 12.2525 + { 12.2526 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 12.2527 + struct pfn_info *l1page; 12.2528 + 12.2529 + if (l1pfn>max_page) 12.2530 + continue; 12.2531 + 12.2532 + l1page = &frame_table[l1pfn]; 12.2533 + 12.2534 + if ( page_get_owner(l1page) == d ) 12.2535 + adjust(l1page, 1, 1); 12.2536 + } 12.2537 + } 12.2538 + 12.2539 + unmap_domain_mem(pt); 12.2540 + break; 12.2541 + 12.2542 + case PGT_l1_page_table: 12.2543 + if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) 12.2544 + adjust( page, 1, 1 ); 12.2545 + 12.2546 + pt = map_domain_mem( pfn<<PAGE_SHIFT ); 12.2547 + 12.2548 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 12.2549 + { 12.2550 + if ( pt[i] & _PAGE_PRESENT ) 12.2551 + { 12.2552 + unsigned long l1pfn = pt[i]>>PAGE_SHIFT; 12.2553 + struct pfn_info *l1page; 12.2554 + 12.2555 + if (l1pfn>max_page) 12.2556 + continue; 12.2557 + 12.2558 + l1page = &frame_table[l1pfn]; 12.2559 + 12.2560 + if ( (page_get_owner(l1page) != d) || 12.2561 + (l1pfn < 0x100) || (l1pfn > max_page) ) 12.2562 + continue; 12.2563 + 12.2564 + adjust(l1page, 1, 0); 12.2565 + } 12.2566 + } 12.2567 + 12.2568 + unmap_domain_mem(pt); 12.2569 + break; 12.2570 + } 12.2571 + 12.2572 + 12.2573 + page->tlbflush_timestamp = 0; 12.2574 + 12.2575 + list_ent = frame_table[pfn].list.next; 12.2576 + } 12.2577 + 12.2578 + spin_unlock(&d->page_alloc_lock); 12.2579 + 12.2580 + adjust(&frame_table[pagetable_val( 12.2581 + d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1); 12.2582 + 12.2583 + printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot ); 12.2584 + 12.2585 + if ( d != current->domain ) 12.2586 + domain_unpause(d); 12.2587 +} 12.2588 + 12.2589 +void audit_domains(void) 12.2590 +{ 12.2591 + struct domain *d; 12.2592 + for_each_domain ( d ) 12.2593 + audit_domain(d); 12.2594 +} 12.2595 + 12.2596 +void audit_domains_key(unsigned char key) 12.2597 +{ 12.2598 + audit_domains(); 12.2599 +} 12.2600 + 12.2601 +#endif
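The ptwr_do_page_fault() added above is the trap half of Xen's writable-pagetable emulation: when a guest writes to a read-only L1 page table, the handler snapshots the page into ptwr_info[cpu].ptinfo[which].page, sets _PAGE_RW on the mapping, and relies on the later ptwr_flush() it calls to reconcile the snapshot against whatever the guest wrote. The standalone sketch below illustrates only that snapshot-and-compare idea; live, snapshot, ENTRIES and the *_sketch functions are illustrative stand-ins, not Xen code.

#include <stdio.h>
#include <string.h>

#define ENTRIES 1024                  /* entries per L1 page table (illustrative) */
typedef unsigned long pte_t;

static pte_t live[ENTRIES];           /* the page the guest writes directly     */
static pte_t snapshot[ENTRIES];       /* copy taken when the write fault fired  */

static void fault_sketch(void)
{
    /* Corresponds to the memcpy() in ptwr_do_page_fault(); the real code
     * then sets _PAGE_RW so subsequent guest writes no longer fault. */
    memcpy(snapshot, live, sizeof(live));
}

static int flush_sketch(void)
{
    /* Corresponds to ptwr_flush(): only entries that differ from the
     * snapshot need revalidation and reference-count adjustment. */
    int i, updates = 0;
    for (i = 0; i < ENTRIES; i++)
        if (live[i] != snapshot[i])
            updates++;
    return updates;
}

int main(void)
{
    fault_sketch();
    live[7] = (0x1234UL << 12) | 0x067;   /* guest installs one new PTE */
    printf("entries needing revalidation: %d\n", flush_sketch());
    return 0;
}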
13.1 --- a/xen/arch/x86/traps.c Tue Feb 08 12:27:23 2005 +0000 13.2 +++ b/xen/arch/x86/traps.c Tue Feb 08 15:13:51 2005 +0000 13.3 @@ -528,7 +528,7 @@ asmlinkage int do_general_protection(str 13.4 13.5 /* Emulate some simple privileged instructions when exec'ed in ring 1. */ 13.6 if ( (regs->error_code == 0) && 13.7 - RING_1(regs) && 13.8 + GUESTOS_FAULT(regs) && 13.9 emulate_privileged_op(regs) ) 13.10 return 0; 13.11
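This do_general_protection() hunk tightens when Xen will emulate a privileged instruction on the guest's behalf: the error code must be zero and the fault must come from the guest kernel proper, which GUESTOS_FAULT() (defined in the regs.h hunks below) now also verifies is not VM86 code whose CS low bits merely look like an RPL. A rough sketch of that decision path; fake_regs, EF_VM, try_emulate() and the handler name are placeholders, and the sketch pretends emulation always succeeds.

#include <stdio.h>

struct fake_regs { unsigned long cs, eflags, error_code; };

#define EF_VM (1UL << 17)             /* EFLAGS.VM: virtual-8086 mode */

static int guestos_fault(const struct fake_regs *r)
{
    /* Guest kernel runs in ring 1 on x86_32; VM86 code must be excluded
     * because its CS is a real-mode segment, not a selector with an RPL. */
    return !(r->eflags & EF_VM) && ((r->cs & 3) == 1);
}

static int try_emulate(const struct fake_regs *r)
{
    /* Stand-in for emulate_privileged_op(): assume emulation succeeds. */
    (void)r;
    return 1;
}

static const char *handle_gp_fault(const struct fake_regs *r)
{
    if (r->error_code == 0 && guestos_fault(r) && try_emulate(r))
        return "emulated in Xen";
    return "bounced to the guest";
}

int main(void)
{
    struct fake_regs guest_kernel = { .cs = 0x0819, .eflags = 0x202, .error_code = 0 };
    struct fake_regs vm86_task    = { .cs = 0x0019, .eflags = 0x202 | EF_VM, .error_code = 0 };
    printf("guest kernel: %s\n", handle_gp_fault(&guest_kernel));
    printf("vm86 task:    %s\n", handle_gp_fault(&vm86_task));
    return 0;
}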
14.1 --- a/xen/common/dom_mem_ops.c Tue Feb 08 12:27:23 2005 +0000 14.2 +++ b/xen/common/dom_mem_ops.c Tue Feb 08 15:13:51 2005 +0000 14.3 @@ -122,7 +122,7 @@ free_dom_mem(struct domain *d, 14.4 long 14.5 do_dom_mem_op(unsigned long op, 14.6 unsigned long *extent_list, 14.7 - unsigned long nr_extents, 14.8 + unsigned int nr_extents, 14.9 unsigned int extent_order, 14.10 domid_t domid) 14.11 { 14.12 @@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long op, 14.13 start_extent = op >> START_EXTENT_SHIFT; 14.14 op &= (1 << START_EXTENT_SHIFT) - 1; 14.15 14.16 - if ( unlikely(start_extent > nr_extents) || 14.17 - unlikely(nr_extents > ~0U) ) /* can pack into a uint? */ 14.18 + if ( unlikely(start_extent > nr_extents) ) 14.19 return -EINVAL; 14.20 14.21 if ( likely(domid == DOMID_SELF) ) 14.22 @@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long op, 14.23 { 14.24 case MEMOP_increase_reservation: 14.25 rc = alloc_dom_mem( 14.26 - d, extent_list, start_extent, 14.27 - (unsigned int)nr_extents, extent_order); 14.28 + d, extent_list, start_extent, nr_extents, extent_order); 14.29 break; 14.30 case MEMOP_decrease_reservation: 14.31 rc = free_dom_mem( 14.32 - d, extent_list, start_extent, 14.33 - (unsigned int)nr_extents, extent_order); 14.34 + d, extent_list, start_extent, nr_extents, extent_order); 14.35 break; 14.36 default: 14.37 rc = -ENOSYS;
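This hunk narrows nr_extents to unsigned int at the interface itself, so the explicit "can pack into a uint?" check and the casts at the call sites become redundant. For context, the op argument is itself a packed value: the sub-operation lives in the low bits and the restart point (start_extent) in the high bits, exactly as the two lines at the top of the function unpack it. A minimal sketch of that encoding follows; the shift value of 4 and the helper name are chosen here purely for illustration, the real START_EXTENT_SHIFT comes from Xen's headers.

#include <stdio.h>

#define START_EXTENT_SHIFT 4          /* illustrative stand-in only */
#define MEMOP_increase_reservation 0
#define MEMOP_decrease_reservation 1

static unsigned long pack_op(unsigned long memop, unsigned long start_extent)
{
    return (start_extent << START_EXTENT_SHIFT) | memop;
}

int main(void)
{
    /* A continuation of a decrease-reservation call after 100 extents. */
    unsigned long op = pack_op(MEMOP_decrease_reservation, 100);

    unsigned long start_extent = op >> START_EXTENT_SHIFT;
    unsigned long memop        = op & ((1 << START_EXTENT_SHIFT) - 1);

    printf("memop=%lu start_extent=%lu\n", memop, start_extent);
    return 0;
}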
15.1 --- a/xen/include/asm-x86/page.h Tue Feb 08 12:27:23 2005 +0000 15.2 +++ b/xen/include/asm-x86/page.h Tue Feb 08 15:13:51 2005 +0000 15.3 @@ -99,6 +99,13 @@ typedef struct { unsigned long l4_lo; } 15.4 (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1)) 15.5 #endif 15.6 15.7 +/* Given a virtual address, get an entry offset into a linear page table. */ 15.8 +#if defined(__i386__) 15.9 +#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT) 15.10 +#elif defined(__x86_64__) 15.11 +#define l1_linear_offset(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT) 15.12 +#endif 15.13 + 15.14 #if defined(__i386__) 15.15 #define pagetable_t l2_pgentry_t 15.16 #define pagetable_val(_x) ((_x).l2_lo)
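The new l1_linear_offset() macro turns a virtual address into an index into the linear page table. On i386 that is a plain shift; on x86_64 the address is first masked to the low 48 bits so that high-half canonical addresses (bits 48-63 sign-extended to 1) fold into the same index range. A quick standalone check of the two forms; the example addresses and the _32/_64 macro names are chosen here only for illustration.

#include <stdio.h>

#define PAGE_SHIFT 12

#define l1_linear_offset_32(_a) ((_a) >> PAGE_SHIFT)
#define l1_linear_offset_64(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT)

int main(void)
{
    unsigned long user_va   = 0x0000000000400000UL;   /* low canonical half  */
    unsigned long kernel_va = 0xffff800000001000UL;   /* high canonical half */

    printf("i386-style index for 0x400000:      %#lx\n",
           l1_linear_offset_32(0x00400000UL));
    printf("x86_64 index for low-half address:  %#lx\n",
           l1_linear_offset_64(user_va));
    /* Without the 48-bit mask the sign-extended upper bits would push the
     * index far beyond the 2^36 slots a 48-bit linear table can hold. */
    printf("x86_64 index for high-half address: %#lx\n",
           l1_linear_offset_64(kernel_va));
    return 0;
}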
16.1 --- a/xen/include/asm-x86/x86_32/regs.h Tue Feb 08 12:27:23 2005 +0000 16.2 +++ b/xen/include/asm-x86/x86_32/regs.h Tue Feb 08 15:13:51 2005 +0000 16.3 @@ -39,4 +39,6 @@ struct xen_regs 16.4 #define RING_2(_r) (((_r)->cs & 3) == 2) 16.5 #define RING_3(_r) (((_r)->cs & 3) == 3) 16.6 16.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r)) 16.8 + 16.9 #endif
17.1 --- a/xen/include/asm-x86/x86_64/regs.h Tue Feb 08 12:27:23 2005 +0000 17.2 +++ b/xen/include/asm-x86/x86_64/regs.h Tue Feb 08 15:13:51 2005 +0000 17.3 @@ -36,4 +36,6 @@ struct xen_regs 17.4 #define RING_2(_r) (((_r)->cs & 3) == 2) 17.5 #define RING_3(_r) (((_r)->cs & 3) == 3) 17.6 17.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r)) 17.8 + 17.9 #endif
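Note the deliberate asymmetry between the two definitions: a 32-bit guest kernel runs in ring 1, while a long-mode guest kernel is pushed out to ring 3, so GUESTOS_FAULT() tests a different RPL on each architecture (and the VM86 test cannot fire in long mode, where EFLAGS.VM stays clear). A compact side-by-side sketch of the two predicates; the struct, field and constant names here are stand-ins for the real xen_regs definitions.

#include <stdio.h>

struct fake_regs { unsigned long cs, eflags; };

#define EF_VM (1UL << 17)

/* x86_32: guest kernel executes in ring 1, and VM86 code must be excluded. */
#define GUESTOS_FAULT_32(_r) (!((_r)->eflags & EF_VM) && (((_r)->cs & 3) == 1))
/* x86_64: long-mode guest kernels run in ring 3, so test RPL 3 instead.    */
#define GUESTOS_FAULT_64(_r) (!((_r)->eflags & EF_VM) && (((_r)->cs & 3) == 3))

int main(void)
{
    struct fake_regs r32 = { .cs = 0x0819, .eflags = 0x202 };  /* RPL 1 */
    struct fake_regs r64 = { .cs = 0x0833, .eflags = 0x202 };  /* RPL 3 */
    printf("32-bit guest kernel fault: %d\n", GUESTOS_FAULT_32(&r32));
    printf("64-bit guest kernel fault: %d\n", GUESTOS_FAULT_64(&r64));
    return 0;
}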
18.1 --- a/xen/include/asm-x86/x86_64/uaccess.h Tue Feb 08 12:27:23 2005 +0000 18.2 +++ b/xen/include/asm-x86/x86_64/uaccess.h Tue Feb 08 15:13:51 2005 +0000 18.3 @@ -15,34 +15,19 @@ 18.4 #define VERIFY_READ 0 18.5 #define VERIFY_WRITE 1 18.6 18.7 -#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START) 18.8 - 18.9 /* 18.10 - * Test whether a block of memory is a valid user space address. 18.11 - * Returns 0 if the range is valid, nonzero otherwise. 18.12 - * 18.13 - * This is equivalent to the following test: 18.14 - * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ? 18.15 - * (((u65)addr + (u65)size) >= ((u65)1 << 64)) : 18.16 - * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START)) 18.17 + * Valid if in +ve half of 48-bit address space, or above Xen-reserved area. 18.18 + * This is also valid for range checks (addr, addr+size). As long as the 18.19 + * start address is outside the Xen-reserved area then we will access a 18.20 + * non-canonical address (and thus fault) before ever reaching VIRT_START. 18.21 */ 18.22 -#define __range_not_ok(addr,size) ({ \ 18.23 - unsigned long flag,sum; \ 18.24 - if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \ 18.25 - asm("addq %3,%1 ; sbbq %0,%0" \ 18.26 - :"=&r" (flag), "=r" (sum) \ 18.27 - :"1" (addr),"g" ((long)(size))); \ 18.28 - else \ 18.29 - asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0" \ 18.30 - :"=&r" (flag), "=r" (sum) \ 18.31 - :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \ 18.32 - flag; }) 18.33 +#define __addr_ok(addr) \ 18.34 + (((unsigned long)(addr) < (1UL<<48)) || \ 18.35 + ((unsigned long)(addr) >= HYPERVISOR_VIRT_END)) 18.36 18.37 -#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0) 18.38 +#define access_ok(type, addr, size) (__addr_ok(addr)) 18.39 18.40 -#define array_access_ok(type,addr,count,size) \ 18.41 - (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ && \ 18.42 - access_ok(type,addr,(unsigned long)count*(unsigned long)size)) 18.43 +#define array_access_ok(type,addr,count,size) (__addr_ok(addr)) 18.44 18.45 extern long __get_user_bad(void); 18.46 extern void __put_user_bad(void);
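The rewritten __addr_ok() replaces the carry-chain arithmetic with two plain comparisons, leaning on the argument in the new comment: an access that starts below 2^48 or above the Xen-reserved area either stays in guest-accessible space or runs into the non-canonical hole and faults before it can reach hypervisor memory, so only the start address needs checking. A small sketch of the predicate; the HYPERVISOR_VIRT_END value and the example addresses below are illustrative, the real constant comes from Xen's memory-layout headers.

#include <stdio.h>

/* Illustrative stand-in; the real value is fixed by Xen's memory layout. */
#define HYPERVISOR_VIRT_END 0xffff880000000000UL

#define addr_ok(addr)                                  \
    (((unsigned long)(addr) < (1UL << 48)) ||          \
     ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))

int main(void)
{
    unsigned long guest_low  = 0x0000000000401000UL;  /* ordinary guest VA        */
    unsigned long xen_space  = 0xffff830000000000UL;  /* inside reserved area     */
    unsigned long guest_high = 0xffff900000000000UL;  /* above the reserved area  */

    printf("%d %d %d\n",
           addr_ok(guest_low),    /* 1: allowed            */
           addr_ok(xen_space),    /* 0: rejected           */
           addr_ok(guest_high));  /* 1: allowed            */
    return 0;
}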