# HG changeset patch
# User kaf24@scramble.cl.cam.ac.uk
# Date 1107875631 0
# Node ID 9f7935ea4606adeac33df05e402210015686058e
# Parent ea98f0bb6510fd8e2f5c02839d09b31b27d3f735
# Parent f504382b179f1957843ccd39ab959abf62359275
bitkeeper revision 1.1159.212.128 (4208d72fZEHIE9NOZZbr91V7R-3gUg)

Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk

diff -r ea98f0bb6510 -r 9f7935ea4606 .rootkeys
--- a/.rootkeys Tue Feb 08 12:27:23 2005 +0000
+++ b/.rootkeys Tue Feb 08 15:13:51 2005 +0000
@@ -867,8 +867,8 @@ 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/
 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c
 3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c
 3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c
-40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c
 41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c
+40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c
 3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c
 41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c
 41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c
diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.4.29-xen-sparse/mm/memory.c
--- a/linux-2.4.29-xen-sparse/mm/memory.c Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.4.29-xen-sparse/mm/memory.c Tue Feb 08 15:13:51 2005 +0000
@@ -915,7 +915,7 @@ static inline void establish_pte(struct
 #ifdef CONFIG_XEN if ( likely(vma->vm_mm == current->mm) ) { XEN_flush_page_update_queue(); - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG); + HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG); } else { set_pte(page_table, entry); flush_tlb_page(vma, address);
@@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct
 #ifdef CONFIG_XEN if ( likely(vma->vm_mm == current->mm) ) { XEN_flush_page_update_queue(); - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0); + HYPERVISOR_update_va_mapping(address, pte, 0); } else { set_pte(page_table, pte); XEN_flush_page_update_queue();
@@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_s
 #ifdef CONFIG_XEN if ( likely(vma->vm_mm == current->mm) ) { XEN_flush_page_update_queue(); - HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0); + HYPERVISOR_update_va_mapping(addr, entry, 0); } else { set_pte(page_table, entry); XEN_flush_page_update_queue();
@@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct *
 #ifdef CONFIG_XEN if ( likely(vma->vm_mm == current->mm) ) { XEN_flush_page_update_queue(); - HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0); + HYPERVISOR_update_va_mapping(address, entry, 0); } else { set_pte(page_table, entry); XEN_flush_page_update_queue();
diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c Tue Feb 08 15:13:51 2005 +0000
@@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int
 for ( i = 0; i < nr_pages; i++ ) { mcl[i].op = __HYPERVISOR_update_va_mapping; - mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; + mcl[i].args[0] = MMAP_VADDR(idx, i); mcl[i].args[1] = 0; mcl[i].args[2] = 0; }
@@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blki
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + MMAP_VADDR(pending_idx, 0), (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, 0, (blkif->is_blktap ?
ID_TO_DOM(req->id) : blkif->domid) ) ) goto out; #else if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + MMAP_VADDR(pending_idx, 0), (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, 0, blkif->domid) ) @@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t for ( i = 0; i < nr_psegs; i++ ) { mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i); mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; mcl[i].args[2] = 0; #ifdef CONFIG_XEN_BLKDEV_TAP_BE diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c Tue Feb 08 12:27:23 2005 +0000 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c Tue Feb 08 15:13:51 2005 +0000 @@ -234,7 +234,7 @@ static void net_rx_action(unsigned long mmu[2].val = MMUEXT_REASSIGN_PAGE; mcl[0].op = __HYPERVISOR_update_va_mapping; - mcl[0].args[0] = vdata >> PAGE_SHIFT; + mcl[0].args[0] = vdata; mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; mcl[0].args[2] = 0; mcl[1].op = __HYPERVISOR_mmu_update; @@ -409,7 +409,7 @@ static void net_tx_action(unsigned long { pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; mcl[0].op = __HYPERVISOR_update_va_mapping; - mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[0] = MMAP_VADDR(pending_idx); mcl[0].args[1] = 0; mcl[0].args[2] = 0; mcl++; @@ -546,7 +546,7 @@ static void net_tx_action(unsigned long skb_reserve(skb, 16); mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[0] = MMAP_VADDR(pending_idx); mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; mcl[0].args[2] = 0; mcl[0].args[3] = netif->domid; diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c Tue Feb 08 12:27:23 2005 +0000 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c Tue Feb 08 15:13:51 2005 +0000 @@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(str = INVALID_P2M_ENTRY; rx_mcl[i].op = __HYPERVISOR_update_va_mapping; - rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + rx_mcl[i].args[0] = (unsigned long)skb->head; rx_mcl[i].args[1] = 0; rx_mcl[i].args[2] = 0; } @@ -593,7 +593,7 @@ static int netif_poll(struct net_device mmu->val = __pa(skb->head) >> PAGE_SHIFT; mmu++; mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + mcl->args[0] = (unsigned long)skb->head; mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; mcl->args[2] = 0; mcl++; diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c --- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c Tue Feb 08 12:27:23 2005 +0000 +++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c Tue Feb 08 15:13:51 2005 +0000 @@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int for ( i = 0; i < nr_pages; i++ ) { mcl[i].op = __HYPERVISOR_update_va_mapping; - mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; + mcl[i].args[0] = MMAP_VADDR(idx, i); mcl[i].args[1] = 0; mcl[i].args[2] = 0; } @@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t i++, offset += PAGE_SIZE ) { mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[0] = 
MMAP_VADDR(pending_idx, i); mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot; mcl[i].args[2] = 0; mcl[i].args[3] = up->domid;
@@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t
 { /* Map in ISO schedule, if necessary. */ mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i); mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot; mcl[i].args[2] = 0; mcl[i].args[3] = up->domid;
diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h
--- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h Tue Feb 08 15:13:51 2005 +0000
@@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned lo
 if (__dirty) { \ if ( likely((__vma)->vm_mm == current->mm) ) { \ xen_flush_page_update_queue(); \ - HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \ + HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \ } else { \ xen_l1_entry_update((__ptep), (__entry).pte_low); \ flush_tlb_page((__vma), (__address)); \
@@ -445,7 +445,7 @@ do { \
 do { \ if (likely((__vma)->vm_mm == current->mm)) { \ xen_flush_page_update_queue(); \ - HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, \ + HYPERVISOR_update_va_mapping((__address), \ __entry, 0); \ } else { \ xen_l1_entry_update((__ptep), (__entry).pte_low); \
diff -r ea98f0bb6510 -r 9f7935ea4606 linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h
--- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h Tue Feb 08 12:27:23 2005 +0000
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h Tue Feb 08 15:13:51 2005 +0000
@@ -438,7 +438,7 @@ HYPERVISOR_multicall(
 static inline int HYPERVISOR_update_va_mapping( - unsigned long page_nr, pte_t new_val, unsigned long flags) + unsigned long va, pte_t new_val, unsigned long flags) { int ret; unsigned long ign1, ign2, ign3;
@@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping(
 TRAP_INSTR : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) : "0" (__HYPERVISOR_update_va_mapping), - "1" (page_nr), "2" ((new_val).pte_low), "3" (flags) + "1" (va), "2" ((new_val).pte_low), "3" (flags) : "memory" ); if ( unlikely(ret < 0) ) { printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n", - page_nr, (new_val).pte_low, flags); + va, (new_val).pte_low, flags); BUG(); }
@@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op(
 static inline int HYPERVISOR_update_va_mapping_otherdomain( - unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid) + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) { int ret; unsigned long ign1, ign2, ign3, ign4;
@@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain
 TRAP_INSTR : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), - "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) : + "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) : "memory" ); return ret;
diff -r ea98f0bb6510 -r 9f7935ea4606 netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
--- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h Tue Feb 08 12:27:23 2005 +0000
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h Tue Feb 08 15:13:51 2005 +0000
@@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, in
 } static inline int -HYPERVISOR_update_va_mapping(unsigned long page_nr,
unsigned long new_val, +HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val, unsigned long flags) { int ret; @@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned lo TRAP_INSTR : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) : "0" (__HYPERVISOR_update_va_mapping), - "1" (page_nr), "2" (new_val), "3" (flags) + "1" (va), "2" (new_val), "3" (flags) : "memory" ); if (__predict_false(ret < 0)) panic("Failed update VA mapping: %08lx, %08lx, %08lx", - page_nr, new_val, flags); + va, new_val, flags); return ret; } @@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int c } static inline int -HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr, +HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, unsigned long new_val, unsigned long flags, domid_t domid) { int ret; @@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain TRAP_INSTR : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), - "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) : + "1" (va), "2" (new_val), "3" (flags), "4" (domid) : "memory" ); return ret; diff -r ea98f0bb6510 -r 9f7935ea4606 netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c --- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c Tue Feb 08 12:27:23 2005 +0000 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c Tue Feb 08 15:13:51 2005 +0000 @@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_soft INVALID_P2M_ENTRY; rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; - rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va; rx_mcl[nr_pfns].args[1] = 0; rx_mcl[nr_pfns].args[2] = 0; @@ -679,7 +679,7 @@ xen_network_handler(void *arg) mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT; mmu++; mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT; + mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va; mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW; mcl->args[2] = UVMF_FLUSH_TLB; // 0; mcl++; @@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_s INVALID_P2M_ENTRY; rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; - rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[0] = va; rx_mcl[nr_pfns].args[1] = 0; rx_mcl[nr_pfns].args[2] = 0; diff -r ea98f0bb6510 -r 9f7935ea4606 xen/arch/x86/memory.c --- a/xen/arch/x86/memory.c Tue Feb 08 12:27:23 2005 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2594 +0,0 @@ -/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ -/****************************************************************************** - * arch/x86/memory.c - * - * Copyright (c) 2002-2004 K A Fraser - * Copyright (c) 2004 Christian Limpach - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* - * A description of the x86 page table API: - * - * Domains trap to do_mmu_update with a list of update requests. - * This is a list of (ptr, val) pairs, where the requested operation - * is *ptr = val. - * - * Reference counting of pages: - * ---------------------------- - * Each page has two refcounts: tot_count and type_count. - * - * TOT_COUNT is the obvious reference count. It counts all uses of a - * physical page frame by a domain, including uses as a page directory, - * a page table, or simple mappings via a PTE. This count prevents a - * domain from releasing a frame back to the free pool when it still holds - * a reference to it. - * - * TYPE_COUNT is more subtle. A frame can be put to one of three - * mutually-exclusive uses: it might be used as a page directory, or a - * page table, or it may be mapped writable by the domain [of course, a - * frame may not be used in any of these three ways!]. - * So, type_count is a count of the number of times a frame is being - * referred to in its current incarnation. Therefore, a page can only - * change its type when its type count is zero. - * - * Pinning the page type: - * ---------------------- - * The type of a page can be pinned/unpinned with the commands - * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, - * pinning is not reference counted, so it can't be nested). - * This is useful to prevent a page's type count falling to zero, at which - * point safety checks would need to be carried out next time the count - * is increased again. - * - * A further note on writable page mappings: - * ----------------------------------------- - * For simplicity, the count of writable mappings for a page may not - * correspond to reality. The 'writable count' is incremented for every - * PTE which maps the page with the _PAGE_RW flag set. However, for - * write access to be possible the page directory entry must also have - * its _PAGE_RW bit set. We do not check this as it complicates the - * reference counting considerably [consider the case of multiple - * directory entries referencing a single page table, some with the RW - * bit set, others not -- it starts getting a bit messy]. - * In normal use, this simplification shouldn't be a problem. - * However, the logic can be added if required. - * - * One more note on read-only page mappings: - * ----------------------------------------- - * We want domains to be able to map pages for read-only access. The - * main reason is that page tables and directories should be readable - * by a domain, but it would not be safe for them to be writable. - * However, domains have free access to rings 1 & 2 of the Intel - * privilege model. In terms of page protection, these are considered - * to be part of 'supervisor mode'. The WP bit in CR0 controls whether - * read-only restrictions are respected in supervisor mode -- if the - * bit is clear then any mapped page is writable. - * - * We get round this by always setting the WP bit and disallowing - * updates to it. This is very unlikely to cause a problem for guest - * OS's, which will generally use the WP bit to simplify copy-on-write - * implementation (in that case, OS wants a fault when it writes to - * an application-supplied buffer). 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef VERBOSE -#define MEM_LOG(_f, _a...) \ - printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ - current->domain->id , __LINE__ , ## _a ) -#else -#define MEM_LOG(_f, _a...) ((void)0) -#endif - -static int alloc_l2_table(struct pfn_info *page); -static int alloc_l1_table(struct pfn_info *page); -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); -static int get_page_and_type_from_pagenr(unsigned long page_nr, - u32 type, - struct domain *d); - -static void free_l2_table(struct pfn_info *page); -static void free_l1_table(struct pfn_info *page); - -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); - -/* Used to defer flushing of memory structures. */ -static struct { -#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ -#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ - unsigned long deferred_ops; - /* If non-NULL, specifies a foreign subject domain for some operations. */ - struct domain *foreign; -} __cacheline_aligned percpu_info[NR_CPUS]; - -/* - * Returns the current foreign domain; defaults to the currently-executing - * domain if a foreign override hasn't been specified. - */ -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain) - -/* Private domain structs for DOMID_XEN and DOMID_IO. */ -static struct domain *dom_xen, *dom_io; - -/* Frame table and its size in pages. */ -struct pfn_info *frame_table; -unsigned long frame_table_size; -unsigned long max_page; - -void __init init_frametable(void) -{ - unsigned long i, p; - - frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; - frame_table_size = max_page * sizeof(struct pfn_info); - frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; - - for ( i = 0; i < frame_table_size; i += (4UL << 20) ) - { - p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); - if ( p == 0 ) - panic("Not enough memory for frame table\n"); - map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, - 4UL << 20, PAGE_HYPERVISOR); - } - - memset(frame_table, 0, frame_table_size); -} - -void arch_init_memory(void) -{ - extern void subarch_init_memory(struct domain *); - - memset(percpu_info, 0, sizeof(percpu_info)); - - /* - * Initialise our DOMID_XEN domain. - * Any Xen-heap pages that we will allow to be mapped will have - * their domain field set to dom_xen. - */ - dom_xen = alloc_domain_struct(); - atomic_set(&dom_xen->refcnt, 1); - dom_xen->id = DOMID_XEN; - - /* - * Initialise our DOMID_IO domain. - * This domain owns no pages but is considered a special case when - * mapping I/O pages, as the mappings occur at the priv of the caller. - */ - dom_io = alloc_domain_struct(); - atomic_set(&dom_io->refcnt, 1); - dom_io->id = DOMID_IO; - - subarch_init_memory(dom_xen); -} - -void write_ptbase(struct exec_domain *ed) -{ - struct domain *d = ed->domain; - unsigned long pa; - -#ifdef CONFIG_VMX - if ( unlikely(shadow_mode(d)) ) - pa = ((shadow_mode(d) == SHM_full_32) ? 
- pagetable_val(ed->arch.monitor_table) : - pagetable_val(ed->arch.shadow_table)); - else - pa = pagetable_val(ed->arch.pagetable); -#else - if ( unlikely(shadow_mode(d)) ) - pa = pagetable_val(ed->arch.shadow_table); - else - pa = pagetable_val(ed->arch.pagetable); -#endif - - write_cr3(pa); -} - -static void __invalidate_shadow_ldt(struct exec_domain *d) -{ - int i; - unsigned long pfn; - struct pfn_info *page; - - d->arch.shadow_ldt_mapcnt = 0; - - for ( i = 16; i < 32; i++ ) - { - pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]); - if ( pfn == 0 ) continue; - d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); - page = &frame_table[pfn]; - ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); - ASSERT_PAGE_IS_DOMAIN(page, d->domain); - put_page_and_type(page); - } - - /* Dispose of the (now possibly invalid) mappings from the TLB. */ - percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; -} - - -static inline void invalidate_shadow_ldt(struct exec_domain *d) -{ - if ( d->arch.shadow_ldt_mapcnt != 0 ) - __invalidate_shadow_ldt(d); -} - - -static int alloc_segdesc_page(struct pfn_info *page) -{ - struct desc_struct *descs; - int i; - - descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); - - for ( i = 0; i < 512; i++ ) - if ( unlikely(!check_descriptor(&descs[i])) ) - goto fail; - - unmap_domain_mem(descs); - return 1; - - fail: - unmap_domain_mem(descs); - return 0; -} - - -/* Map shadow page at offset @off. */ -int map_ldt_shadow_page(unsigned int off) -{ - struct exec_domain *ed = current; - struct domain *d = ed->domain; - unsigned long l1e; - - if ( unlikely(in_irq()) ) - BUG(); - - __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> - PAGE_SHIFT) + off]); - - if ( unlikely(!(l1e & _PAGE_PRESENT)) || - unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], - d, PGT_ldt_page)) ) - return 0; - - ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); - ed->arch.shadow_ldt_mapcnt++; - - return 1; -} - - -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) -{ - struct pfn_info *page = &frame_table[page_nr]; - - if ( unlikely(!pfn_is_ram(page_nr)) ) - { - MEM_LOG("Pfn %08lx is not RAM", page_nr); - return 0; - } - - if ( unlikely(!get_page(page, d)) ) - { - MEM_LOG("Could not get page ref for pfn %08lx", page_nr); - return 0; - } - - return 1; -} - - -static int get_page_and_type_from_pagenr(unsigned long page_nr, - u32 type, - struct domain *d) -{ - struct pfn_info *page = &frame_table[page_nr]; - - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) - return 0; - - if ( unlikely(!get_page_type(page, type)) ) - { -#ifdef VERBOSE - if ( (type & PGT_type_mask) != PGT_l1_page_table ) - MEM_LOG("Bad page type for pfn %08lx (%08x)", - page_nr, page->u.inuse.type_info); -#endif - put_page(page); - return 0; - } - - return 1; -} - - -/* - * We allow an L2 tables to map each other (a.k.a. linear page tables). It - * needs some special care with reference counst and access permissions: - * 1. The mapping entry must be read-only, or the guest may get write access - * to its own PTEs. - * 2. We must only bump the reference counts for an *already validated* - * L2 table, or we can end up in a deadlock in get_page_type() by waiting - * on a validation that is required to complete that validation. - * 3. We only need to increment the reference counts for the mapped page - * frame if it is mapped by a different L2 table. This is sufficient and - * also necessary to allow validation of an L2 table mapping itself. 
- */ -static int -get_linear_pagetable( - l2_pgentry_t l2e, unsigned long pfn, struct domain *d) -{ - u32 x, y; - struct pfn_info *page; - - if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) - { - MEM_LOG("Attempt to create linear p.t. with write perms"); - return 0; - } - - if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) - { - /* Make sure the mapped frame belongs to the correct domain. */ - if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) ) - return 0; - - /* - * Make sure that the mapped frame is an already-validated L2 table. - * If so, atomically increment the count (checking for overflow). - */ - page = &frame_table[l2_pgentry_to_pagenr(l2e)]; - y = page->u.inuse.type_info; - do { - x = y; - if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || - unlikely((x & (PGT_type_mask|PGT_validated)) != - (PGT_l2_page_table|PGT_validated)) ) - { - put_page(page); - return 0; - } - } - while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); - } - - return 1; -} - - -static int -get_page_from_l1e( - l1_pgentry_t l1e, struct domain *d) -{ - unsigned long l1v = l1_pgentry_val(l1e); - unsigned long pfn = l1_pgentry_to_pagenr(l1e); - struct pfn_info *page = &frame_table[pfn]; - extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); - - if ( !(l1v & _PAGE_PRESENT) ) - return 1; - - if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) - { - MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); - return 0; - } - - if ( unlikely(!pfn_is_ram(pfn)) ) - { - /* Revert to caller privileges if FD == DOMID_IO. */ - if ( d == dom_io ) - d = current->domain; - - if ( IS_PRIV(d) ) - return 1; - - if ( IS_CAPABLE_PHYSDEV(d) ) - return domain_iomem_in_pfn(d, pfn); - - MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); - return 0; - } - - return ((l1v & _PAGE_RW) ? - get_page_and_type(page, d, PGT_writable_page) : - get_page(page, d)); -} - - -/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ -static int -get_page_from_l2e( - l2_pgentry_t l2e, unsigned long pfn, - struct domain *d, unsigned long va_idx) -{ - int rc; - - if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) - return 1; - - if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) - { - MEM_LOG("Bad L2 page type settings %04lx", - l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); - return 0; - } - - rc = get_page_and_type_from_pagenr( - l2_pgentry_to_pagenr(l2e), - PGT_l1_page_table | (va_idx<u.inuse.type_info & PGT_type_mask) == - PGT_ldt_page)) && - unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) - invalidate_shadow_ldt(e->exec_domain[0]); - put_page(page); - } -} - - -/* - * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. - * Note also that this automatically deals correctly with linear p.t.'s. - */ -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) -{ - if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && - ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) - put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); -} - - -static int alloc_l2_table(struct pfn_info *page) -{ - struct domain *d = page_get_owner(page); - unsigned long page_nr = page_to_pfn(page); - l2_pgentry_t *pl2e; - int i; - - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); - - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) - goto fail; - -#if defined(__i386__) - /* Now we add our private high mappings. 
*/ - memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); - pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | - __PAGE_HYPERVISOR); -#endif - - unmap_domain_mem(pl2e); - return 1; - - fail: - while ( i-- > 0 ) - put_page_from_l2e(pl2e[i], page_nr); - - unmap_domain_mem(pl2e); - return 0; -} - - -static int alloc_l1_table(struct pfn_info *page) -{ - struct domain *d = page_get_owner(page); - unsigned long page_nr = page_to_pfn(page); - l1_pgentry_t *pl1e; - int i; - - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); - - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) - if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) - goto fail; - - unmap_domain_mem(pl1e); - return 1; - - fail: - while ( i-- > 0 ) - put_page_from_l1e(pl1e[i], d); - - unmap_domain_mem(pl1e); - return 0; -} - - -static void free_l2_table(struct pfn_info *page) -{ - unsigned long page_nr = page - frame_table; - l2_pgentry_t *pl2e; - int i; - - pl2e = map_domain_mem(page_nr << PAGE_SHIFT); - - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - put_page_from_l2e(pl2e[i], page_nr); - - unmap_domain_mem(pl2e); -} - - -static void free_l1_table(struct pfn_info *page) -{ - struct domain *d = page_get_owner(page); - unsigned long page_nr = page - frame_table; - l1_pgentry_t *pl1e; - int i; - - pl1e = map_domain_mem(page_nr << PAGE_SHIFT); - - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) - put_page_from_l1e(pl1e[i], d); - - unmap_domain_mem(pl1e); -} - - -static inline int update_l2e(l2_pgentry_t *pl2e, - l2_pgentry_t ol2e, - l2_pgentry_t nl2e) -{ - unsigned long o = cmpxchg((unsigned long *)pl2e, - l2_pgentry_val(ol2e), - l2_pgentry_val(nl2e)); - if ( o != l2_pgentry_val(ol2e) ) - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", - l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); - return (o == l2_pgentry_val(ol2e)); -} - - -/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ -static int mod_l2_entry(l2_pgentry_t *pl2e, - l2_pgentry_t nl2e, - unsigned long pfn) -{ - l2_pgentry_t ol2e; - unsigned long _ol2e; - - if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= - DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) - { - MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); - return 0; - } - - if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) - return 0; - ol2e = mk_l2_pgentry(_ol2e); - - if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) - { - /* Differ in mapping (bits 12-31) or presence (bit 0)? 
*/ - if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) - return update_l2e(pl2e, ol2e, nl2e); - - if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, - ((unsigned long)pl2e & - ~PAGE_MASK) >> 2)) ) - return 0; - - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) - { - put_page_from_l2e(nl2e, pfn); - return 0; - } - - put_page_from_l2e(ol2e, pfn); - return 1; - } - - if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) - return 0; - - put_page_from_l2e(ol2e, pfn); - return 1; -} - - -static inline int update_l1e(l1_pgentry_t *pl1e, - l1_pgentry_t ol1e, - l1_pgentry_t nl1e) -{ - unsigned long o = l1_pgentry_val(ol1e); - unsigned long n = l1_pgentry_val(nl1e); - - if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || - unlikely(o != l1_pgentry_val(ol1e)) ) - { - MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", - l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); - return 0; - } - - return 1; -} - - -/* Update the L1 entry at pl1e to new value nl1e. */ -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) -{ - l1_pgentry_t ol1e; - unsigned long _ol1e; - struct domain *d = current->domain; - - if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) - { - MEM_LOG("Bad get_user\n"); - return 0; - } - - ol1e = mk_l1_pgentry(_ol1e); - - if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) - { - /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */ - if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) - return update_l1e(pl1e, ol1e, nl1e); - - if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) - return 0; - - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) - { - put_page_from_l1e(nl1e, d); - return 0; - } - - put_page_from_l1e(ol1e, d); - return 1; - } - - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) - return 0; - - put_page_from_l1e(ol1e, d); - return 1; -} - - -int alloc_page_type(struct pfn_info *page, unsigned int type) -{ - switch ( type ) - { - case PGT_l1_page_table: - return alloc_l1_table(page); - case PGT_l2_page_table: - return alloc_l2_table(page); - case PGT_gdt_page: - case PGT_ldt_page: - return alloc_segdesc_page(page); - default: - printk("Bad type in alloc_page_type %x t=%x c=%x\n", - type, page->u.inuse.type_info, - page->count_info); - BUG(); - } - - return 0; -} - - -void free_page_type(struct pfn_info *page, unsigned int type) -{ - struct domain *d = page_get_owner(page); - - switch ( type ) - { - case PGT_l1_page_table: - free_l1_table(page); - break; - - case PGT_l2_page_table: - free_l2_table(page); - break; - - default: - BUG(); - } - - if ( unlikely(shadow_mode(d)) && - (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) - { - unshadow_table(page_to_pfn(page), type); - put_shadow_status(d); - } -} - - -void put_page_type(struct pfn_info *page) -{ - u32 nx, x, y = page->u.inuse.type_info; - - again: - do { - x = y; - nx = x - 1; - - ASSERT((x & PGT_count_mask) != 0); - - /* - * The page should always be validated while a reference is held. The - * exception is during domain destruction, when we forcibly invalidate - * page-table pages if we detect a referential loop. - * See domain.c:relinquish_list(). - */ - ASSERT((x & PGT_validated) || - test_bit(DF_DYING, &page_get_owner(page)->d_flags)); - - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - /* Record TLB information for flush later. Races are harmless. 
*/ - page->tlbflush_timestamp = tlbflush_current_time(); - - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && - likely(nx & PGT_validated) ) - { - /* - * Page-table pages must be unvalidated when count is zero. The - * 'free' is safe because the refcnt is non-zero and validated - * bit is clear => other ops will spin or fail. - */ - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, - x & ~PGT_validated)) != x) ) - goto again; - /* We cleared the 'valid bit' so we do the clear up. */ - free_page_type(page, x & PGT_type_mask); - /* Carry on, but with the 'valid bit' now clear. */ - x &= ~PGT_validated; - nx &= ~PGT_validated; - } - } - else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == - (PGT_pinned | 1)) ) - { - /* Page is now only pinned. Make the back pointer mutable again. */ - nx |= PGT_va_mutable; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); -} - - -int get_page_type(struct pfn_info *page, u32 type) -{ - u32 nx, x, y = page->u.inuse.type_info; - - again: - do { - x = y; - nx = x + 1; - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); - return 0; - } - else if ( unlikely((x & PGT_count_mask) == 0) ) - { - if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) - { - /* - * On type change we check to flush stale TLB entries. This - * may be unnecessary (e.g., page was GDT/LDT) but those - * circumstances should be very rare. - */ - struct domain *d = page_get_owner(page); - if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], - page->tlbflush_timestamp)) ) - { - perfc_incr(need_flush_tlb_flush); - flush_tlb_cpu(d->exec_domain[0]->processor); - } - - /* We lose existing type, back pointer, and validity. */ - nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); - nx |= type; - - /* No special validation needed for writable pages. */ - /* Page tables and GDT/LDT need to be scanned for validity. */ - if ( type == PGT_writable_page ) - nx |= PGT_validated; - } - } - else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) - { - if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) - { - if ( ((x & PGT_type_mask) != PGT_l2_page_table) || - ((type & PGT_type_mask) != PGT_l1_page_table) ) - MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", - x & PGT_type_mask, type, page_to_pfn(page)); - return 0; - } - else if ( (x & PGT_va_mask) == PGT_va_mutable ) - { - /* The va backpointer is mutable, hence we update it. */ - nx &= ~PGT_va_mask; - nx |= type; /* we know the actual type is correct */ - } - else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) - { - /* This table is potentially mapped at multiple locations. */ - nx &= ~PGT_va_mask; - nx |= PGT_va_unknown; - } - } - else if ( unlikely(!(x & PGT_validated)) ) - { - /* Someone else is updating validation of this page. Wait... */ - while ( (y = page->u.inuse.type_info) == x ) - { - rep_nop(); - barrier(); - } - goto again; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); - - if ( unlikely(!(nx & PGT_validated)) ) - { - /* Try to validate page type; drop the new reference on failure. */ - if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) - { - MEM_LOG("Error while validating pfn %08lx for type %08x." - " caf=%08x taf=%08x\n", - page_to_pfn(page), type, - page->count_info, - page->u.inuse.type_info); - /* Noone else can get a reference. We hold the only ref. 
*/ - page->u.inuse.type_info = 0; - return 0; - } - - /* Noone else is updating simultaneously. */ - __set_bit(_PGT_validated, &page->u.inuse.type_info); - } - - return 1; -} - - -int new_guest_cr3(unsigned long pfn) -{ - struct exec_domain *ed = current; - struct domain *d = ed->domain; - int okay, cpu = smp_processor_id(); - unsigned long old_base_pfn; - - okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); - if ( likely(okay) ) - { - invalidate_shadow_ldt(ed); - - percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; - old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; - ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); - - shadow_mk_pagetable(ed); - - write_ptbase(ed); - - put_page_and_type(&frame_table[old_base_pfn]); - } - else - { - MEM_LOG("Error while installing new baseptr %08lx", pfn); - } - - return okay; -} - -static int do_extended_command(unsigned long ptr, unsigned long val) -{ - int okay = 1, cpu = smp_processor_id(); - unsigned int cmd = val & MMUEXT_CMD_MASK; - unsigned long pfn = ptr >> PAGE_SHIFT; - struct pfn_info *page = &frame_table[pfn]; - struct exec_domain *ed = current; - struct domain *d = ed->domain, *nd, *e; - u32 x, y; - domid_t domid; - grant_ref_t gntref; - - switch ( cmd ) - { - case MMUEXT_PIN_L1_TABLE: - case MMUEXT_PIN_L2_TABLE: - /* - * We insist that, if you pin an L1 page, it's the first thing that - * you do to it. This is because we require the backptr to still be - * mutable. This assumption seems safe. - */ - okay = get_page_and_type_from_pagenr( - pfn, - ((cmd==MMUEXT_PIN_L2_TABLE) ? - PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), - FOREIGNDOM); - - if ( unlikely(!okay) ) - { - MEM_LOG("Error while pinning pfn %08lx", pfn); - break; - } - - if ( unlikely(test_and_set_bit(_PGT_pinned, - &page->u.inuse.type_info)) ) - { - MEM_LOG("Pfn %08lx already pinned", pfn); - put_page_and_type(page); - okay = 0; - break; - } - - break; - - case MMUEXT_UNPIN_TABLE: - if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) - { - MEM_LOG("Page %08lx bad domain (dom=%p)", - ptr, page_get_owner(page)); - } - else if ( likely(test_and_clear_bit(_PGT_pinned, - &page->u.inuse.type_info)) ) - { - put_page_and_type(page); - put_page(page); - } - else - { - okay = 0; - put_page(page); - MEM_LOG("Pfn %08lx not pinned", pfn); - } - break; - - case MMUEXT_NEW_BASEPTR: - okay = new_guest_cr3(pfn); - break; - - case MMUEXT_TLB_FLUSH: - percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; - break; - - case MMUEXT_INVLPG: - __flush_tlb_one(ptr); - break; - - case MMUEXT_FLUSH_CACHE: - if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) - { - MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); - okay = 0; - } - else - { - wbinvd(); - } - break; - - case MMUEXT_SET_LDT: - { - unsigned long ents = val >> MMUEXT_CMD_SHIFT; - if ( ((ptr & (PAGE_SIZE-1)) != 0) || - (ents > 8192) || - ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || - ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) - { - okay = 0; - MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); - } - else if ( (ed->arch.ldt_ents != ents) || - (ed->arch.ldt_base != ptr) ) - { - invalidate_shadow_ldt(ed); - ed->arch.ldt_base = ptr; - ed->arch.ldt_ents = ents; - load_LDT(ed); - percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; - if ( ents != 0 ) - percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; - } - break; - } - - case MMUEXT_SET_FOREIGNDOM: - domid = (domid_t)(val >> 16); - - if ( (e = percpu_info[cpu].foreign) != NULL ) - put_domain(e); - percpu_info[cpu].foreign = NULL; - - if ( !IS_PRIV(d) ) - 
{ - switch ( domid ) - { - case DOMID_IO: - get_knownalive_domain(dom_io); - percpu_info[cpu].foreign = dom_io; - break; - default: - MEM_LOG("Dom %u cannot set foreign dom\n", d->id); - okay = 0; - break; - } - } - else - { - percpu_info[cpu].foreign = e = find_domain_by_id(domid); - if ( e == NULL ) - { - switch ( domid ) - { - case DOMID_XEN: - get_knownalive_domain(dom_xen); - percpu_info[cpu].foreign = dom_xen; - break; - case DOMID_IO: - get_knownalive_domain(dom_io); - percpu_info[cpu].foreign = dom_io; - break; - default: - MEM_LOG("Unknown domain '%u'", domid); - okay = 0; - break; - } - } - } - break; - - case MMUEXT_TRANSFER_PAGE: - domid = (domid_t)(val >> 16); - gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); - - if ( unlikely(IS_XEN_HEAP_FRAME(page)) || - unlikely(!pfn_is_ram(pfn)) || - unlikely((e = find_domain_by_id(domid)) == NULL) ) - { - MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); - okay = 0; - break; - } - - spin_lock(&d->page_alloc_lock); - - /* - * The tricky bit: atomically release ownership while there is just one - * benign reference to the page (PGC_allocated). If that reference - * disappears then the deallocation routine will safely spin. - */ - nd = page_get_owner(page); - y = page->count_info; - do { - x = y; - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != - (1|PGC_allocated)) || - unlikely(nd != d) ) - { - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", page_to_pfn(page), - d, d->id, nd, x, page->u.inuse.type_info); - spin_unlock(&d->page_alloc_lock); - put_domain(e); - return 0; - } - __asm__ __volatile__( - LOCK_PREFIX "cmpxchg8b %2" - : "=d" (nd), "=a" (y), - "=m" (*(volatile u64 *)(&page->count_info)) - : "0" (d), "1" (x), "c" (NULL), "b" (x) ); - } - while ( unlikely(nd != d) || unlikely(y != x) ); - - /* - * Unlink from 'd'. At least one reference remains (now anonymous), so - * noone else is spinning to try to delete this page from 'd'. - */ - d->tot_pages--; - list_del(&page->list); - - spin_unlock(&d->page_alloc_lock); - - spin_lock(&e->page_alloc_lock); - - /* - * Check that 'e' will accept the page and has reservation headroom. - * Also, a domain mustn't have PGC_allocated pages when it is dying. - */ - ASSERT(e->tot_pages <= e->max_pages); - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || - unlikely(e->tot_pages == e->max_pages) || - unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) - { - MEM_LOG("Transferee has no reservation headroom (%d,%d), or " - "provided a bad grant ref, or is dying (%08lx).\n", - e->tot_pages, e->max_pages, e->d_flags); - spin_unlock(&e->page_alloc_lock); - put_domain(e); - okay = 0; - break; - } - - /* Okay, add the page to 'e'. */ - if ( unlikely(e->tot_pages++ == 0) ) - get_knownalive_domain(e); - list_add_tail(&page->list, &e->page_list); - page_set_owner(page, e); - - spin_unlock(&e->page_alloc_lock); - - /* Transfer is all done: tell the guest about its new page frame. */ - gnttab_notify_transfer(e, gntref, pfn); - - put_domain(e); - break; - - case MMUEXT_REASSIGN_PAGE: - if ( unlikely(!IS_PRIV(d)) ) - { - MEM_LOG("Dom %u has no reassignment priv", d->id); - okay = 0; - break; - } - - e = percpu_info[cpu].foreign; - if ( unlikely(e == NULL) ) - { - MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); - okay = 0; - break; - } - - /* - * Grab both page_list locks, in order. This prevents the page from - * disappearing elsewhere while we modify the owner, and we'll need - * both locks if we're successful so that we can change lists. 
- */ - if ( d < e ) - { - spin_lock(&d->page_alloc_lock); - spin_lock(&e->page_alloc_lock); - } - else - { - spin_lock(&e->page_alloc_lock); - spin_lock(&d->page_alloc_lock); - } - - /* A domain shouldn't have PGC_allocated pages when it is dying. */ - if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || - unlikely(IS_XEN_HEAP_FRAME(page)) ) - { - MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); - okay = 0; - goto reassign_fail; - } - - /* - * The tricky bit: atomically change owner while there is just one - * benign reference to the page (PGC_allocated). If that reference - * disappears then the deallocation routine will safely spin. - */ - nd = page_get_owner(page); - y = page->count_info; - do { - x = y; - if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != - (1|PGC_allocated)) || - unlikely(nd != d) ) - { - MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", page_to_pfn(page), - d, d->id, nd, x, page->u.inuse.type_info); - okay = 0; - goto reassign_fail; - } - __asm__ __volatile__( - LOCK_PREFIX "cmpxchg8b %3" - : "=d" (nd), "=a" (y), "=c" (e), - "=m" (*(volatile u64 *)(&page->count_info)) - : "0" (d), "1" (x), "c" (e), "b" (x) ); - } - while ( unlikely(nd != d) || unlikely(y != x) ); - - /* - * Unlink from 'd'. We transferred at least one reference to 'e', so - * noone else is spinning to try to delete this page from 'd'. - */ - d->tot_pages--; - list_del(&page->list); - - /* - * Add the page to 'e'. Someone may already have removed the last - * reference and want to remove the page from 'e'. However, we have - * the lock so they'll spin waiting for us. - */ - if ( unlikely(e->tot_pages++ == 0) ) - get_knownalive_domain(e); - list_add_tail(&page->list, &e->page_list); - - reassign_fail: - spin_unlock(&d->page_alloc_lock); - spin_unlock(&e->page_alloc_lock); - break; - - case MMUEXT_CLEAR_FOREIGNDOM: - if ( (e = percpu_info[cpu].foreign) != NULL ) - put_domain(e); - percpu_info[cpu].foreign = NULL; - break; - - default: - MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); - okay = 0; - break; - } - - return okay; -} - -int do_mmu_update( - mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) -{ -/* - * We steal the m.s.b. of the @count parameter to indicate whether this - * invocation of do_mmu_update() is resuming a previously preempted call. - * We steal the next 15 bits to remember the current FOREIGNDOM. - */ -#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) -#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) -#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<domain; - u32 type_info; - domid_t domid; - - LOCK_BIGLOCK(d); - - cleanup_writable_pagetable(d); - - if ( unlikely(shadow_mode(d)) ) - check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */ - - /* - * If we are resuming after preemption, read how much work we have already - * done. This allows us to set the @done output parameter correctly. - * We also reset FOREIGNDOM here. - */ - if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) - { - if ( !(count & MMU_UPDATE_PREEMPTED) ) - { - /* Count overflow into private FOREIGNDOM field. 
*/ - MEM_LOG("do_mmu_update count is too large"); - rc = -EINVAL; - goto out; - } - count &= ~MMU_UPDATE_PREEMPTED; - domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; - count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; - if ( unlikely(pdone != NULL) ) - (void)get_user(done, pdone); - if ( (domid != current->domain->id) && - !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) - { - rc = -EINVAL; - goto out; - } - } - - perfc_incrc(calls_to_mmu_update); - perfc_addc(num_page_updates, count); - - if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) - { - rc = -EFAULT; - goto out; - } - - for ( i = 0; i < count; i++ ) - { - if ( hypercall_preempt_check() ) - { - rc = hypercall3_create_continuation( - __HYPERVISOR_mmu_update, ureqs, - (count - i) | - (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | - MMU_UPDATE_PREEMPTED, pdone); - break; - } - - if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) - { - MEM_LOG("Bad __copy_from_user"); - rc = -EFAULT; - break; - } - - cmd = req.ptr & (sizeof(l1_pgentry_t)-1); - pfn = req.ptr >> PAGE_SHIFT; - - okay = 0; - - switch ( cmd ) - { - /* - * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. - */ - case MMU_NORMAL_PT_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) - { - MEM_LOG("Could not get page for normal update"); - break; - } - - if ( likely(prev_pfn == pfn) ) - { - va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); - } - else - { - if ( prev_pfn != 0 ) - unmap_domain_mem((void *)va); - va = (unsigned long)map_domain_mem(req.ptr); - prev_pfn = pfn; - } - - page = &frame_table[pfn]; - switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) - { - case PGT_l1_page_table: - if ( likely(get_page_type( - page, type_info & (PGT_type_mask|PGT_va_mask))) ) - { - okay = mod_l1_entry((l1_pgentry_t *)va, - mk_l1_pgentry(req.val)); - - if ( unlikely(shadow_mode(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - shadow_l1_normal_pt_update( - req.ptr, req.val, &prev_smfn, &prev_spl1e); - put_shadow_status(d); - } - - put_page_type(page); - } - break; - case PGT_l2_page_table: - if ( likely(get_page_type(page, PGT_l2_page_table)) ) - { - okay = mod_l2_entry((l2_pgentry_t *)va, - mk_l2_pgentry(req.val), - pfn); - - if ( unlikely(shadow_mode(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - shadow_l2_normal_pt_update(req.ptr, req.val); - put_shadow_status(d); - } - - put_page_type(page); - } - break; - default: - if ( likely(get_page_type(page, PGT_writable_page)) ) - { - *(unsigned long *)va = req.val; - okay = 1; - put_page_type(page); - } - break; - } - - put_page(page); - break; - - case MMU_MACHPHYS_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) - { - MEM_LOG("Could not get page for mach->phys update"); - break; - } - - machine_to_phys_mapping[pfn] = req.val; - okay = 1; - - /* - * If in log-dirty mode, mark the corresponding pseudo-physical - * page as dirty. - */ - if ( unlikely(shadow_mode(d) == SHM_logdirty) && - mark_dirty(d, pfn) ) - d->arch.shadow_dirty_block_count++; - - put_page(&frame_table[pfn]); - break; - - /* - * MMU_EXTENDED_COMMAND: Extended command is specified - * in the least-siginificant bits of the 'value' field. 
- */ - case MMU_EXTENDED_COMMAND: - req.ptr &= ~(sizeof(l1_pgentry_t) - 1); - okay = do_extended_command(req.ptr, req.val); - break; - - default: - MEM_LOG("Invalid page update command %08lx", req.ptr); - break; - } - - if ( unlikely(!okay) ) - { - rc = -EINVAL; - break; - } - - ureqs++; - } - - out: - if ( prev_pfn != 0 ) - unmap_domain_mem((void *)va); - - if ( unlikely(prev_spl1e != 0) ) - unmap_domain_mem((void *)prev_spl1e); - - deferred_ops = percpu_info[cpu].deferred_ops; - percpu_info[cpu].deferred_ops = 0; - - if ( deferred_ops & DOP_FLUSH_TLB ) - local_flush_tlb(); - - if ( deferred_ops & DOP_RELOAD_LDT ) - (void)map_ldt_shadow_page(0); - - if ( unlikely(percpu_info[cpu].foreign != NULL) ) - { - put_domain(percpu_info[cpu].foreign); - percpu_info[cpu].foreign = NULL; - } - - /* Add incremental work we have done to the @done output parameter. */ - if ( unlikely(pdone != NULL) ) - __put_user(done + i, pdone); - - if ( unlikely(shadow_mode(d)) ) - check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */ - - UNLOCK_BIGLOCK(d); - return rc; -} - - -int do_update_va_mapping(unsigned long page_nr, - unsigned long val, - unsigned long flags) -{ - struct exec_domain *ed = current; - struct domain *d = ed->domain; - int err = 0; - unsigned int cpu = ed->processor; - unsigned long deferred_ops; - - perfc_incrc(calls_to_update_va); - - if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) - return -EINVAL; - - LOCK_BIGLOCK(d); - - cleanup_writable_pagetable(d); - - /* - * XXX When we make this support 4MB superpages we should also deal with - * the case of updating L2 entries. - */ - - if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], - mk_l1_pgentry(val))) ) - err = -EINVAL; - - if ( unlikely(shadow_mode(d)) ) - { - unsigned long sval = 0; - - l1pte_propagate_from_guest(d, &val, &sval); - - if ( unlikely(__put_user(sval, ((unsigned long *)( - &shadow_linear_pg_table[page_nr])))) ) - { - /* - * Since L2's are guranteed RW, failure indicates the page was not - * shadowed, so ignore. - */ - perfc_incrc(shadow_update_va_fail); - } - - /* - * If we're in log-dirty mode then we need to note that we've updated - * the PTE in the PT-holding page. We need the machine frame number - * for this. 
- */ - if ( shadow_mode(d) == SHM_logdirty ) - mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT)); - - check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ - } - - deferred_ops = percpu_info[cpu].deferred_ops; - percpu_info[cpu].deferred_ops = 0; - - if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || - unlikely(flags & UVMF_FLUSH_TLB) ) - local_flush_tlb(); - else if ( unlikely(flags & UVMF_INVLPG) ) - __flush_tlb_one(page_nr << PAGE_SHIFT); - - if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) - (void)map_ldt_shadow_page(0); - - UNLOCK_BIGLOCK(d); - - return err; -} - -int do_update_va_mapping_otherdomain(unsigned long page_nr, - unsigned long val, - unsigned long flags, - domid_t domid) -{ - unsigned int cpu = smp_processor_id(); - struct domain *d; - int rc; - - if ( unlikely(!IS_PRIV(current->domain)) ) - return -EPERM; - - percpu_info[cpu].foreign = d = find_domain_by_id(domid); - if ( unlikely(d == NULL) ) - { - MEM_LOG("Unknown domain '%u'", domid); - return -ESRCH; - } - - rc = do_update_va_mapping(page_nr, val, flags); - - put_domain(d); - percpu_info[cpu].foreign = NULL; - - return rc; -} - - - -/************************* - * Descriptor Tables - */ - -void destroy_gdt(struct exec_domain *ed) -{ - int i; - unsigned long pfn; - - for ( i = 0; i < 16; i++ ) - { - if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) - put_page_and_type(&frame_table[pfn]); - ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); - } -} - - -long set_gdt(struct exec_domain *ed, - unsigned long *frames, - unsigned int entries) -{ - struct domain *d = ed->domain; - /* NB. There are 512 8-byte entries per GDT page. */ - int i = 0, nr_pages = (entries + 511) / 512; - struct desc_struct *vgdt; - unsigned long pfn; - - /* Check the first page in the new GDT. */ - if ( (pfn = frames[0]) >= max_page ) - goto fail; - - /* The first page is special because Xen owns a range of entries in it. */ - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) - { - /* GDT checks failed: try zapping the Xen reserved entries. */ - if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) - goto fail; - vgdt = map_domain_mem(pfn << PAGE_SHIFT); - memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, - NR_RESERVED_GDT_ENTRIES*8); - unmap_domain_mem(vgdt); - put_page_and_type(&frame_table[pfn]); - - /* Okay, we zapped the entries. Now try the GDT checks again. */ - if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) - goto fail; - } - - /* Check the remaining pages in the new GDT. */ - for ( i = 1; i < nr_pages; i++ ) - if ( ((pfn = frames[i]) >= max_page) || - !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) - goto fail; - - /* Copy reserved GDT entries to the new GDT. */ - vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); - memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, - gdt_table + FIRST_RESERVED_GDT_ENTRY, - NR_RESERVED_GDT_ENTRIES*8); - unmap_domain_mem(vgdt); - - /* Tear down the old GDT. */ - destroy_gdt(ed); - - /* Install the new GDT. 
*/ - for ( i = 0; i < nr_pages; i++ ) - ed->arch.perdomain_ptes[i] = - mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); - - SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); - SET_GDT_ENTRIES(ed, entries); - - return 0; - - fail: - while ( i-- > 0 ) - put_page_and_type(&frame_table[frames[i]]); - return -EINVAL; -} - - -long do_set_gdt(unsigned long *frame_list, unsigned int entries) -{ - int nr_pages = (entries + 511) / 512; - unsigned long frames[16]; - long ret; - - if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) - return -EINVAL; - - if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) - return -EFAULT; - - LOCK_BIGLOCK(current->domain); - - if ( (ret = set_gdt(current, frames, entries)) == 0 ) - { - local_flush_tlb(); - __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); - } - - UNLOCK_BIGLOCK(current->domain); - - return ret; -} - - -long do_update_descriptor( - unsigned long pa, unsigned long word1, unsigned long word2) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - struct desc_struct *gdt_pent, d; - struct pfn_info *page; - struct exec_domain *ed; - long ret = -EINVAL; - - d.a = (u32)word1; - d.b = (u32)word2; - - LOCK_BIGLOCK(current->domain); - - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { - UNLOCK_BIGLOCK(current->domain); - return -EINVAL; - } - - page = &frame_table[pfn]; - if ( unlikely(!get_page(page, current->domain)) ) { - UNLOCK_BIGLOCK(current->domain); - return -EINVAL; - } - - /* Check if the given frame is in use in an unsafe context. */ - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_gdt_page: - /* Disallow updates of Xen-reserved descriptors in the current GDT. */ - for_each_exec_domain(current->domain, ed) { - if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) && - (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && - (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) - goto out; - } - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) - goto out; - break; - case PGT_ldt_page: - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) - goto out; - break; - default: - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) - goto out; - break; - } - - /* All is good so make the update. */ - gdt_pent = map_domain_mem(pa); - memcpy(gdt_pent, &d, 8); - unmap_domain_mem(gdt_pent); - - put_page_type(page); - - ret = 0; /* success */ - - out: - put_page(page); - - UNLOCK_BIGLOCK(current->domain); - - return ret; -} - - - -/************************* - * Writable Pagetables - */ - -ptwr_info_t ptwr_info[NR_CPUS]; - -#ifdef VERBOSE -int ptwr_debug = 0x0; -#define PTWR_PRINTK(_f, _a...) \ - do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) -#define PTWR_PRINT_WHICH (which ? 'I' : 'A') -#else -#define PTWR_PRINTK(_f, _a...) ((void)0) -#endif - -/* Flush the given writable p.t. page and write-protect it again. */ -void ptwr_flush(const int which) -{ - unsigned long sstat, spte, pte, *ptep, l1va; - l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; - l2_pgentry_t *pl2e; - int i, cpu = smp_processor_id(); - struct exec_domain *ed = current; - struct domain *d = ed->domain; - - l1va = ptwr_info[cpu].ptinfo[which].l1va; - ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; - - /* - * STEP 1. Write-protect the p.t. page so no more updates can occur. - */ - - if ( unlikely(__get_user(pte, ptep)) ) - { - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); - /* - * Really a bug. 
We could read this PTE during the initial fault, - * and pagetables can't have changed meantime. XXX Multi-CPU guests? - */ - BUG(); - } - PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", - PTWR_PRINT_WHICH, ptep, pte); - pte &= ~_PAGE_RW; - - if ( unlikely(shadow_mode(d)) ) - { - /* Write-protect the p.t. page in the shadow page table. */ - l1pte_propagate_from_guest(d, &pte, &spte); - __put_user( - spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); - - /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ - sstat = get_shadow_status(d, pte >> PAGE_SHIFT); - if ( sstat & PSH_shadowed ) - sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); - } - - /* Write-protect the p.t. page in the guest page table. */ - if ( unlikely(__put_user(pte, ptep)) ) - { - MEM_LOG("ptwr: Could not update pte at %p\n", ptep); - /* - * Really a bug. We could write this PTE during the initial fault, - * and pagetables can't have changed meantime. XXX Multi-CPU guests? - */ - BUG(); - } - - /* Ensure that there are no stale writable mappings in any TLB. */ - /* NB. INVLPG is a serialising instruction: flushes pending updates. */ -#if 1 - __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ -#else - flush_tlb_all(); -#endif - PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", - PTWR_PRINT_WHICH, ptep, pte); - - /* - * STEP 2. Validate any modified PTEs. - */ - - pl1e = ptwr_info[cpu].ptinfo[which].pl1e; - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) - { - ol1e = ptwr_info[cpu].ptinfo[which].page[i]; - nl1e = pl1e[i]; - - if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) - continue; - - /* - * Fast path for PTEs that have merely been write-protected - * (e.g., during a Unix fork()). A strict reduction in privilege. - */ - if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) - { - if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) - { - if ( unlikely(sl1e != NULL) ) - l1pte_propagate_from_guest( - d, &l1_pgentry_val(nl1e), - &l1_pgentry_val(sl1e[i])); - put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); - } - continue; - } - - if ( unlikely(!get_page_from_l1e(nl1e, d)) ) - { - MEM_LOG("ptwr: Could not re-validate l1 page\n"); - /* - * Make the remaining p.t's consistent before crashing, so the - * reference counts are correct. - */ - memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], - (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); - unmap_domain_mem(pl1e); - ptwr_info[cpu].ptinfo[which].l1va = 0; - UNLOCK_BIGLOCK(d); - domain_crash(); - } - - if ( unlikely(sl1e != NULL) ) - l1pte_propagate_from_guest( - d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); - - if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) - put_page_from_l1e(ol1e, d); - } - unmap_domain_mem(pl1e); - - /* - * STEP 3. Reattach the L1 p.t. page into the current address space. - */ - - if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) ) - { - pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; - *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); - } - - /* - * STEP 4. Final tidy-up. - */ - - ptwr_info[cpu].ptinfo[which].l1va = 0; - - if ( unlikely(sl1e != NULL) ) - { - unmap_domain_mem(sl1e); - put_shadow_status(d); - } -} - -/* Write page fault handler: check if guest is trying to modify a PTE. 
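/*
 * Illustrative sketch, not part of this changeset: the fast path in
 * ptwr_flush() above accepts a changed PTE without full re-validation when
 * the only difference is that _PAGE_RW was cleared -- a strict reduction
 * in privilege.  The flag values are the architectural x86 PTE bits.
 */
#include <stdio.h>

#define _PAGE_PRESENT 0x001UL
#define _PAGE_RW      0x002UL

/* True iff 'new_pte' is exactly 'old_pte' with the RW bit dropped. */
static int rw_only_downgrade(unsigned long old_pte, unsigned long new_pte)
{
    return (old_pte & _PAGE_RW) && (old_pte == (new_pte | _PAGE_RW));
}

int main(void)
{
    unsigned long pte = 0x12345000UL | _PAGE_PRESENT | _PAGE_RW;
    printf("%d\n", rw_only_downgrade(pte, pte & ~_PAGE_RW));  /* 1: RW cleared  */
    printf("%d\n", rw_only_downgrade(pte, pte ^ 0x1000UL));   /* 0: frame moved */
    return 0;
}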
*/ -int ptwr_do_page_fault(unsigned long addr) -{ - unsigned long pte, pfn, l2e; - struct pfn_info *page; - l2_pgentry_t *pl2e; - int which, cpu = smp_processor_id(); - u32 l2_idx; - - /* - * Attempt to read the PTE that maps the VA being accessed. By checking for - * PDE validity in the L2 we avoid many expensive fixups in __get_user(). - */ - if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & - _PAGE_PRESENT) || - __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) - { - return 0; - } - - pfn = pte >> PAGE_SHIFT; - page = &frame_table[pfn]; - - /* We are looking only for read-only mappings of p.t. pages. */ - if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || - ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) - { - return 0; - } - - /* Get the L2 index at which this L1 p.t. is always mapped. */ - l2_idx = page->u.inuse.type_info & PGT_va_mask; - if ( unlikely(l2_idx >= PGT_va_unknown) ) - { - domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ - } - l2_idx >>= PGT_va_shift; - - if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) - { - MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); - domain_crash(); - } - - /* - * Is the L1 p.t. mapped into the current address space? If so we call it - * an ACTIVE p.t., otherwise it is INACTIVE. - */ - pl2e = &linear_l2_table[l2_idx]; - l2e = l2_pgentry_val(*pl2e); - which = PTWR_PT_INACTIVE; - if ( (l2e >> PAGE_SHIFT) == pfn ) - { - /* Check the PRESENT bit to set ACTIVE. */ - if ( likely(l2e & _PAGE_PRESENT) ) - which = PTWR_PT_ACTIVE; - else { - /* - * If the PRESENT bit is clear, we may be conflicting with - * the current ACTIVE p.t. (it may be the same p.t. mapped - * at another virt addr). - * The ptwr_flush call below will restore the PRESENT bit. - */ - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && - l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) - which = PTWR_PT_ACTIVE; - } - } - - PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " - "pfn %08lx\n", PTWR_PRINT_WHICH, - addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); - - /* - * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at - * time. If there is already one, we must flush it out. - */ - if ( ptwr_info[cpu].ptinfo[which].l1va ) - ptwr_flush(which); - - ptwr_info[cpu].ptinfo[which].l1va = addr | 1; - ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; - - /* For safety, disconnect the L1 p.t. page from current space. */ - if ( (which == PTWR_PT_ACTIVE) && - likely(!shadow_mode(current->domain)) ) - { - *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); -#if 1 - flush_tlb(); /* XXX Multi-CPU guests? */ -#else - flush_tlb_all(); -#endif - } - - /* Temporarily map the L1 page, and make a copy of it. */ - ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); - memcpy(ptwr_info[cpu].ptinfo[which].page, - ptwr_info[cpu].ptinfo[which].pl1e, - ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); - - /* Finally, make the p.t. page writable by the guest OS. */ - pte |= _PAGE_RW; - PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, - &linear_pg_table[addr>>PAGE_SHIFT], pte); - if ( unlikely(__put_user(pte, (unsigned long *) - &linear_pg_table[addr>>PAGE_SHIFT])) ) - { - MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) - &linear_pg_table[addr>>PAGE_SHIFT]); - /* Toss the writable pagetable state and crash. 
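/*
 * Illustrative sketch, not part of this changeset: ptwr_do_page_fault()
 * above recovers the L2 slot at which an L1 table is mapped from spare
 * bits of type_info (the PGT_va back-pointer).  A toy encode/decode of
 * such a packed back-pointer; the 10-bit field at bit 20 is invented for
 * illustration and is not Xen's actual PGT_va_* layout.
 */
#include <stdio.h>
#include <stdint.h>

#define VA_SHIFT 20u
#define VA_MASK  (0x3ffu << VA_SHIFT)   /* room for one L2 slot index */

static uint32_t set_backptr(uint32_t type_info, uint32_t l2_idx)
{
    return (type_info & ~VA_MASK) | ((l2_idx << VA_SHIFT) & VA_MASK);
}

static uint32_t get_backptr(uint32_t type_info)
{
    return (type_info & VA_MASK) >> VA_SHIFT;
}

int main(void)
{
    uint32_t t = set_backptr(0x00000007u, 0x123u);
    printf("l2_idx = %#x\n", (unsigned)get_backptr(t));   /* 0x123 */
    return 0;
}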
*/ - unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); - ptwr_info[cpu].ptinfo[which].l1va = 0; - domain_crash(); - } - - return EXCRET_fault_fixed; -} - -static __init int ptwr_init(void) -{ - int i; - - for ( i = 0; i < smp_num_cpus; i++ ) - { - ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = - (void *)alloc_xenheap_page(); - ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = - (void *)alloc_xenheap_page(); - } - - return 0; -} -__initcall(ptwr_init); - - - - -/************************************************************************/ -/************************************************************************/ -/************************************************************************/ - -#ifndef NDEBUG - -void ptwr_status(void) -{ - unsigned long pte, *ptep, pfn; - struct pfn_info *page; - int cpu = smp_processor_id(); - - ptep = (unsigned long *)&linear_pg_table - [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; - - if ( __get_user(pte, ptep) ) { - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); - domain_crash(); - } - - pfn = pte >> PAGE_SHIFT; - page = &frame_table[pfn]; - printk("need to alloc l1 page %p\n", page); - /* make pt page writable */ - printk("need to make read-only l1-page at %p is %08lx\n", - ptep, pte); - - if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) - return; - - if ( __get_user(pte, (unsigned long *) - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { - MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) - ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); - domain_crash(); - } - pfn = pte >> PAGE_SHIFT; - page = &frame_table[pfn]; -} - -void audit_domain(struct domain *d) -{ - int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; - - void adjust (struct pfn_info *page, int dir, int adjtype) - { - int count = page->count_info & PGC_count_mask; - - if ( adjtype ) - { - int tcount = page->u.inuse.type_info & PGT_count_mask; - - ttot++; - - tcount += dir; - - if ( tcount < 0 ) - { - /* This will only come out once. */ - printk("Audit %d: type count whent below zero pfn=%x " - "taf=%x otaf=%x\n", - d->id, page-frame_table, - page->u.inuse.type_info, - page->tlbflush_timestamp); - } - - page->u.inuse.type_info = - (page->u.inuse.type_info & ~PGT_count_mask) | - (tcount & PGT_count_mask); - } - - ctot++; - count += dir; - if ( count < 0 ) - { - /* This will only come out once. 
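/*
 * Illustrative sketch, not part of this changeset: the audit adjust()
 * helper here bumps a reference count that is packed into the low bits of
 * a larger flags word and warns if it would go negative.  A standalone
 * version of that read-modify-write; the 16-bit mask is invented.
 */
#include <stdio.h>
#include <stdint.h>

#define COUNT_MASK 0x0000ffffu   /* low bits hold the count (illustrative) */

static uint32_t adjust_count(uint32_t word, int dir)
{
    int count = (int)(word & COUNT_MASK) + dir;
    if ( count < 0 )
        fprintf(stderr, "count went below zero\n");
    return (word & ~COUNT_MASK) | ((uint32_t)count & COUNT_MASK);
}

int main(void)
{
    uint32_t w = 0xabcd0001u;
    w = adjust_count(w, +1);
    printf("%#x\n", (unsigned)w);   /* 0xabcd0002 */
    w = adjust_count(adjust_count(w, -1), -1);
    printf("%#x\n", (unsigned)w);   /* 0xabcd0000 */
    return 0;
}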
*/ - printk("Audit %d: general count whent below zero pfn=%x " - "taf=%x otaf=%x\n", - d->id, page-frame_table, - page->u.inuse.type_info, - page->tlbflush_timestamp); - } - - page->count_info = - (page->count_info & ~PGC_count_mask) | - (count & PGC_count_mask); - - } - - void scan_for_pfn(struct domain *d, unsigned long xpfn) - { - unsigned long pfn, *pt; - struct list_head *list_ent; - struct pfn_info *page; - int i; - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - pt = map_domain_mem(pfn<> PAGE_SHIFT) == xpfn) ) - printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", - d->id, i, pfn, page->u.inuse.type_info, - page->count_info); - unmap_domain_mem(pt); - } - - list_ent = frame_table[pfn].list.next; - } - - } - - void scan_for_pfn_remote(unsigned long xpfn) - { - struct domain *e; - for_each_domain ( e ) - scan_for_pfn( e, xpfn ); - } - - int i; - unsigned long pfn; - struct list_head *list_ent; - struct pfn_info *page; - - if ( d != current->domain ) - domain_pause(d); - synchronise_pagetables(~0UL); - - printk("pt base=%lx sh_info=%x\n", - pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT, - virt_to_page(d->shared_info)-frame_table); - - spin_lock(&d->page_alloc_lock); - - /* PHASE 0 */ - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - if ( page_get_owner(page) != d ) - BUG(); - - if ( (page->u.inuse.type_info & PGT_count_mask) > - (page->count_info & PGC_count_mask) ) - printk("taf > caf %x %x pfn=%lx\n", - page->u.inuse.type_info, page->count_info, pfn ); - -#if 0 /* SYSV shared memory pages plus writeable files. */ - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && - (page->u.inuse.type_info & PGT_count_mask) > 1 ) - { - printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", - pfn, - page->u.inuse.type_info, - page->count_info ); - scan_for_pfn_remote(pfn); - } -#endif - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && - (page->u.inuse.type_info & PGT_count_mask) > 1 ) - { - printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", - pfn, - page->u.inuse.type_info, - page->count_info ); - } - - /* Use tlbflush_timestamp to store original type_info. 
*/ - page->tlbflush_timestamp = page->u.inuse.type_info; - - list_ent = frame_table[pfn].list.next; - } - - - /* PHASE 1 */ - - adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1); - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - unsigned long *pt; - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - if ( page_get_owner(page) != d ) - BUG(); - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l2_page_table: - - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) - printk("Audit %d: L2 not validated %x\n", - d->id, page->u.inuse.type_info); - - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) - printk("Audit %d: L2 not pinned %x\n", - d->id, page->u.inuse.type_info); - else - adjust( page, -1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page = &frame_table[l1pfn]; - - if ( page_get_owner(l1page) != d ) - { - printk("L2: Skip bizarre page belonging to other " - "dom %p\n", page_get_owner(l1page)); - continue; - } - - if ( (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - printk("Audit %d: [%x] Found %s Linear PT " - "t=%x pfn=%lx\n", d->id, i, - (l1pfn==pfn) ? "Self" : "Other", - l1page->u.inuse.type_info, - l1pfn); - else if ( (l1page->u.inuse.type_info & PGT_type_mask) != - PGT_l1_page_table ) - printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", - d->id, i, - l1page->u.inuse.type_info, - l1pfn); - - adjust(l1page, -1, 1); - } - } - - unmap_domain_mem(pt); - - break; - - - case PGT_l1_page_table: - - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, -1, 1 ); - - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) - printk("Audit %d: L1 not validated %x\n", - d->id, page->u.inuse.type_info); -#if 0 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) - printk("Audit %d: L1 not pinned %x\n", - d->id, page->u.inuse.type_info); -#endif - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page = &frame_table[l1pfn]; - - if ( l1pfn < 0x100 ) - { - lowmem_mappings++; - continue; - } - - if ( l1pfn > max_page ) - { - io_mappings++; - continue; - } - - if ( pt[i] & _PAGE_RW ) - { - - if ( (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l1_page_table || - (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", - d->id, i, - l1page->u.inuse.type_info, - l1pfn); - - } - - if ( page_get_owner(l1page) != d ) - { - printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " - "pfn=%lx c=%08x t=%08x m2p=%lx\n", - d->id, pfn, i, - page_get_owner(l1page), - l1pfn, - l1page->count_info, - l1page->u.inuse.type_info, - machine_to_phys_mapping[l1pfn]); - continue; - } - - adjust(l1page, -1, 0); - } - } - - unmap_domain_mem(pt); - - break; - } - - list_ent = frame_table[pfn].list.next; - } - - if ( (io_mappings > 0) || (lowmem_mappings > 0) ) - printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", - d->id, lowmem_mappings, io_mappings); - - /* PHASE 2 */ - - ctot = ttot = 0; - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - printk("Audit %d: type count!=0 t=%x ot=%x c=%x 
pfn=%lx\n", - d->id, page->u.inuse.type_info, - page->tlbflush_timestamp, - page->count_info, pfn ); - scan_for_pfn_remote(pfn); - } - default: - if ( (page->count_info & PGC_count_mask) != 1 ) - { - printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", - d->id, - page->count_info, - page->u.inuse.type_info, - page->tlbflush_timestamp, pfn ); - scan_for_pfn_remote(pfn); - } - break; - } - - list_ent = frame_table[pfn].list.next; - } - - /* PHASE 3 */ - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - unsigned long *pt; - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l2_page_table: - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, 1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page; - - if (l1pfn>max_page) - continue; - - l1page = &frame_table[l1pfn]; - - if ( page_get_owner(l1page) == d ) - adjust(l1page, 1, 1); - } - } - - unmap_domain_mem(pt); - break; - - case PGT_l1_page_table: - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, 1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page; - - if (l1pfn>max_page) - continue; - - l1page = &frame_table[l1pfn]; - - if ( (page_get_owner(l1page) != d) || - (l1pfn < 0x100) || (l1pfn > max_page) ) - continue; - - adjust(l1page, 1, 0); - } - } - - unmap_domain_mem(pt); - break; - } - - - page->tlbflush_timestamp = 0; - - list_ent = frame_table[pfn].list.next; - } - - spin_unlock(&d->page_alloc_lock); - - adjust(&frame_table[pagetable_val( - d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1); - - printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot ); - - if ( d != current->domain ) - domain_unpause(d); -} - -void audit_domains(void) -{ - struct domain *d; - for_each_domain ( d ) - audit_domain(d); -} - -void audit_domains_key(unsigned char key) -{ - audit_domains(); -} - -#endif diff -r ea98f0bb6510 -r 9f7935ea4606 xen/arch/x86/mm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm.c Tue Feb 08 15:13:51 2005 +0000 @@ -0,0 +1,2598 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */ +/****************************************************************************** + * arch/x86/mm.c + * + * Copyright (c) 2002-2005 K A Fraser + * Copyright (c) 2004 Christian Limpach + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * A description of the x86 page table API: + * + * Domains trap to do_mmu_update with a list of update requests. + * This is a list of (ptr, val) pairs, where the requested operation + * is *ptr = val. + * + * Reference counting of pages: + * ---------------------------- + * Each page has two refcounts: tot_count and type_count. 
+ * + * TOT_COUNT is the obvious reference count. It counts all uses of a + * physical page frame by a domain, including uses as a page directory, + * a page table, or simple mappings via a PTE. This count prevents a + * domain from releasing a frame back to the free pool when it still holds + * a reference to it. + * + * TYPE_COUNT is more subtle. A frame can be put to one of three + * mutually-exclusive uses: it might be used as a page directory, or a + * page table, or it may be mapped writable by the domain [of course, a + * frame may not be used in any of these three ways!]. + * So, type_count is a count of the number of times a frame is being + * referred to in its current incarnation. Therefore, a page can only + * change its type when its type count is zero. + * + * Pinning the page type: + * ---------------------- + * The type of a page can be pinned/unpinned with the commands + * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, + * pinning is not reference counted, so it can't be nested). + * This is useful to prevent a page's type count falling to zero, at which + * point safety checks would need to be carried out next time the count + * is increased again. + * + * A further note on writable page mappings: + * ----------------------------------------- + * For simplicity, the count of writable mappings for a page may not + * correspond to reality. The 'writable count' is incremented for every + * PTE which maps the page with the _PAGE_RW flag set. However, for + * write access to be possible the page directory entry must also have + * its _PAGE_RW bit set. We do not check this as it complicates the + * reference counting considerably [consider the case of multiple + * directory entries referencing a single page table, some with the RW + * bit set, others not -- it starts getting a bit messy]. + * In normal use, this simplification shouldn't be a problem. + * However, the logic can be added if required. + * + * One more note on read-only page mappings: + * ----------------------------------------- + * We want domains to be able to map pages for read-only access. The + * main reason is that page tables and directories should be readable + * by a domain, but it would not be safe for them to be writable. + * However, domains have free access to rings 1 & 2 of the Intel + * privilege model. In terms of page protection, these are considered + * to be part of 'supervisor mode'. The WP bit in CR0 controls whether + * read-only restrictions are respected in supervisor mode -- if the + * bit is clear then any mapped page is writable. + * + * We get round this by always setting the WP bit and disallowing + * updates to it. This is very unlikely to cause a problem for guest + * OS's, which will generally use the WP bit to simplify copy-on-write + * implementation (in that case, OS wants a fault when it writes to + * an application-supplied buffer). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef VERBOSE +#define MEM_LOG(_f, _a...) \ + printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ + current->domain->id , __LINE__ , ## _a ) +#else +#define MEM_LOG(_f, _a...) 
((void)0) +#endif + +static int alloc_l2_table(struct pfn_info *page); +static int alloc_l1_table(struct pfn_info *page); +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); +static int get_page_and_type_from_pagenr(unsigned long page_nr, + u32 type, + struct domain *d); + +static void free_l2_table(struct pfn_info *page); +static void free_l1_table(struct pfn_info *page); + +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); + +/* Used to defer flushing of memory structures. */ +static struct { +#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ +#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ + unsigned long deferred_ops; + /* If non-NULL, specifies a foreign subject domain for some operations. */ + struct domain *foreign; +} __cacheline_aligned percpu_info[NR_CPUS]; + +/* + * Returns the current foreign domain; defaults to the currently-executing + * domain if a foreign override hasn't been specified. + */ +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain) + +/* Private domain structs for DOMID_XEN and DOMID_IO. */ +static struct domain *dom_xen, *dom_io; + +/* Frame table and its size in pages. */ +struct pfn_info *frame_table; +unsigned long frame_table_size; +unsigned long max_page; + +void __init init_frametable(void) +{ + unsigned long i, p; + + frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; + frame_table_size = max_page * sizeof(struct pfn_info); + frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; + + for ( i = 0; i < frame_table_size; i += (4UL << 20) ) + { + p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20); + if ( p == 0 ) + panic("Not enough memory for frame table\n"); + map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, + 4UL << 20, PAGE_HYPERVISOR); + } + + memset(frame_table, 0, frame_table_size); +} + +void arch_init_memory(void) +{ + extern void subarch_init_memory(struct domain *); + + memset(percpu_info, 0, sizeof(percpu_info)); + + /* + * Initialise our DOMID_XEN domain. + * Any Xen-heap pages that we will allow to be mapped will have + * their domain field set to dom_xen. + */ + dom_xen = alloc_domain_struct(); + atomic_set(&dom_xen->refcnt, 1); + dom_xen->id = DOMID_XEN; + + /* + * Initialise our DOMID_IO domain. + * This domain owns no pages but is considered a special case when + * mapping I/O pages, as the mappings occur at the priv of the caller. + */ + dom_io = alloc_domain_struct(); + atomic_set(&dom_io->refcnt, 1); + dom_io->id = DOMID_IO; + + subarch_init_memory(dom_xen); +} + +void write_ptbase(struct exec_domain *ed) +{ + struct domain *d = ed->domain; + unsigned long pa; + +#ifdef CONFIG_VMX + if ( unlikely(shadow_mode(d)) ) + pa = ((shadow_mode(d) == SHM_full_32) ? 
+ pagetable_val(ed->arch.monitor_table) : + pagetable_val(ed->arch.shadow_table)); + else + pa = pagetable_val(ed->arch.pagetable); +#else + if ( unlikely(shadow_mode(d)) ) + pa = pagetable_val(ed->arch.shadow_table); + else + pa = pagetable_val(ed->arch.pagetable); +#endif + + write_cr3(pa); +} + +static void __invalidate_shadow_ldt(struct exec_domain *d) +{ + int i; + unsigned long pfn; + struct pfn_info *page; + + d->arch.shadow_ldt_mapcnt = 0; + + for ( i = 16; i < 32; i++ ) + { + pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]); + if ( pfn == 0 ) continue; + d->arch.perdomain_ptes[i] = mk_l1_pgentry(0); + page = &frame_table[pfn]; + ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); + ASSERT_PAGE_IS_DOMAIN(page, d->domain); + put_page_and_type(page); + } + + /* Dispose of the (now possibly invalid) mappings from the TLB. */ + percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; +} + + +static inline void invalidate_shadow_ldt(struct exec_domain *d) +{ + if ( d->arch.shadow_ldt_mapcnt != 0 ) + __invalidate_shadow_ldt(d); +} + + +static int alloc_segdesc_page(struct pfn_info *page) +{ + struct desc_struct *descs; + int i; + + descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); + + for ( i = 0; i < 512; i++ ) + if ( unlikely(!check_descriptor(&descs[i])) ) + goto fail; + + unmap_domain_mem(descs); + return 1; + + fail: + unmap_domain_mem(descs); + return 0; +} + + +/* Map shadow page at offset @off. */ +int map_ldt_shadow_page(unsigned int off) +{ + struct exec_domain *ed = current; + struct domain *d = ed->domain; + unsigned long l1e; + + if ( unlikely(in_irq()) ) + BUG(); + + __get_user(l1e, (unsigned long *) + &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]); + + if ( unlikely(!(l1e & _PAGE_PRESENT)) || + unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], + d, PGT_ldt_page)) ) + return 0; + + ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); + ed->arch.shadow_ldt_mapcnt++; + + return 1; +} + + +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) +{ + struct pfn_info *page = &frame_table[page_nr]; + + if ( unlikely(!pfn_is_ram(page_nr)) ) + { + MEM_LOG("Pfn %08lx is not RAM", page_nr); + return 0; + } + + if ( unlikely(!get_page(page, d)) ) + { + MEM_LOG("Could not get page ref for pfn %08lx", page_nr); + return 0; + } + + return 1; +} + + +static int get_page_and_type_from_pagenr(unsigned long page_nr, + u32 type, + struct domain *d) +{ + struct pfn_info *page = &frame_table[page_nr]; + + if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) + return 0; + + if ( unlikely(!get_page_type(page, type)) ) + { +#ifdef VERBOSE + if ( (type & PGT_type_mask) != PGT_l1_page_table ) + MEM_LOG("Bad page type for pfn %08lx (%08x)", + page_nr, page->u.inuse.type_info); +#endif + put_page(page); + return 0; + } + + return 1; +} + + +/* + * We allow an L2 tables to map each other (a.k.a. linear page tables). It + * needs some special care with reference counst and access permissions: + * 1. The mapping entry must be read-only, or the guest may get write access + * to its own PTEs. + * 2. We must only bump the reference counts for an *already validated* + * L2 table, or we can end up in a deadlock in get_page_type() by waiting + * on a validation that is required to complete that validation. + * 3. We only need to increment the reference counts for the mapped page + * frame if it is mapped by a different L2 table. This is sufficient and + * also necessary to allow validation of an L2 table mapping itself. 
+ */ +static int +get_linear_pagetable( + l2_pgentry_t l2e, unsigned long pfn, struct domain *d) +{ + u32 x, y; + struct pfn_info *page; + + if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) + { + MEM_LOG("Attempt to create linear p.t. with write perms"); + return 0; + } + + if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) + { + /* Make sure the mapped frame belongs to the correct domain. */ + if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) ) + return 0; + + /* + * Make sure that the mapped frame is an already-validated L2 table. + * If so, atomically increment the count (checking for overflow). + */ + page = &frame_table[l2_pgentry_to_pagenr(l2e)]; + y = page->u.inuse.type_info; + do { + x = y; + if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || + unlikely((x & (PGT_type_mask|PGT_validated)) != + (PGT_l2_page_table|PGT_validated)) ) + { + put_page(page); + return 0; + } + } + while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); + } + + return 1; +} + + +static int +get_page_from_l1e( + l1_pgentry_t l1e, struct domain *d) +{ + unsigned long l1v = l1_pgentry_val(l1e); + unsigned long pfn = l1_pgentry_to_pagenr(l1e); + struct pfn_info *page = &frame_table[pfn]; + extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); + + if ( !(l1v & _PAGE_PRESENT) ) + return 1; + + if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) ) + { + MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT)); + return 0; + } + + if ( unlikely(!pfn_is_ram(pfn)) ) + { + /* Revert to caller privileges if FD == DOMID_IO. */ + if ( d == dom_io ) + d = current->domain; + + if ( IS_PRIV(d) ) + return 1; + + if ( IS_CAPABLE_PHYSDEV(d) ) + return domain_iomem_in_pfn(d, pfn); + + MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn); + return 0; + } + + return ((l1v & _PAGE_RW) ? + get_page_and_type(page, d, PGT_writable_page) : + get_page(page, d)); +} + + +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ +static int +get_page_from_l2e( + l2_pgentry_t l2e, unsigned long pfn, + struct domain *d, unsigned long va_idx) +{ + int rc; + + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) + return 1; + + if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) + { + MEM_LOG("Bad L2 page type settings %04lx", + l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); + return 0; + } + + rc = get_page_and_type_from_pagenr( + l2_pgentry_to_pagenr(l2e), + PGT_l1_page_table | (va_idx<u.inuse.type_info & PGT_type_mask) == + PGT_ldt_page)) && + unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) + invalidate_shadow_ldt(e->exec_domain[0]); + put_page(page); + } +} + + +/* + * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. + * Note also that this automatically deals correctly with linear p.t.'s. + */ +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) +{ + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && + ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); +} + + +static int alloc_l2_table(struct pfn_info *page) +{ + struct domain *d = page_get_owner(page); + unsigned long page_nr = page_to_pfn(page); + l2_pgentry_t *pl2e; + int i; + + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); + + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) ) + goto fail; + +#if defined(__i386__) + /* Now we add our private high mappings. 
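/*
 * Illustrative sketch, not part of this changeset: get_page_from_l1e()
 * above rejects PTE attribute bits a guest must not set and then takes
 * either a writable-type reference or a plain reference on the frame.
 * Standalone skeleton of that decision; the bit values are the
 * architectural x86 PTE flags, and printf stands in for the real
 * get_page()/get_page_and_type() reference taking.
 */
#include <stdio.h>

#define _PAGE_PRESENT 0x001UL
#define _PAGE_RW      0x002UL
#define _PAGE_PAT     0x080UL
#define _PAGE_GLOBAL  0x100UL

static int check_l1e(unsigned long l1e)
{
    if ( !(l1e & _PAGE_PRESENT) )
        return 1;                              /* nothing to reference */
    if ( l1e & (_PAGE_GLOBAL | _PAGE_PAT) )
        return 0;                              /* disallowed attribute bits */
    if ( l1e & _PAGE_RW )
        printf("writable-type ref on frame %#lx\n", l1e >> 12);
    else
        printf("plain ref on frame %#lx\n", l1e >> 12);
    return 1;
}

int main(void)
{
    check_l1e(0x12345000UL | _PAGE_PRESENT | _PAGE_RW);
    check_l1e(0x12345000UL | _PAGE_PRESENT);
    printf("%d\n", check_l1e(0x12345000UL | _PAGE_PRESENT | _PAGE_GLOBAL)); /* 0 */
    return 0;
}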
*/ + memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); + pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | + __PAGE_HYPERVISOR); +#endif + + unmap_domain_mem(pl2e); + return 1; + + fail: + while ( i-- > 0 ) + put_page_from_l2e(pl2e[i], page_nr); + + unmap_domain_mem(pl2e); + return 0; +} + + +static int alloc_l1_table(struct pfn_info *page) +{ + struct domain *d = page_get_owner(page); + unsigned long page_nr = page_to_pfn(page); + l1_pgentry_t *pl1e; + int i; + + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); + + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) + if ( unlikely(!get_page_from_l1e(pl1e[i], d)) ) + goto fail; + + unmap_domain_mem(pl1e); + return 1; + + fail: + while ( i-- > 0 ) + put_page_from_l1e(pl1e[i], d); + + unmap_domain_mem(pl1e); + return 0; +} + + +static void free_l2_table(struct pfn_info *page) +{ + unsigned long page_nr = page - frame_table; + l2_pgentry_t *pl2e; + int i; + + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); + + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + put_page_from_l2e(pl2e[i], page_nr); + + unmap_domain_mem(pl2e); +} + + +static void free_l1_table(struct pfn_info *page) +{ + struct domain *d = page_get_owner(page); + unsigned long page_nr = page - frame_table; + l1_pgentry_t *pl1e; + int i; + + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); + + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) + put_page_from_l1e(pl1e[i], d); + + unmap_domain_mem(pl1e); +} + + +static inline int update_l2e(l2_pgentry_t *pl2e, + l2_pgentry_t ol2e, + l2_pgentry_t nl2e) +{ + unsigned long o = cmpxchg((unsigned long *)pl2e, + l2_pgentry_val(ol2e), + l2_pgentry_val(nl2e)); + if ( o != l2_pgentry_val(ol2e) ) + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", + l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); + return (o == l2_pgentry_val(ol2e)); +} + + +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ +static int mod_l2_entry(l2_pgentry_t *pl2e, + l2_pgentry_t nl2e, + unsigned long pfn) +{ + l2_pgentry_t ol2e; + unsigned long _ol2e; + + if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= + DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) + { + MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); + return 0; + } + + if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) + return 0; + ol2e = mk_l2_pgentry(_ol2e); + + if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) + { + /* Differ in mapping (bits 12-31) or presence (bit 0)? 
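/*
 * Illustrative sketch, not part of this changeset: update_l2e() (and
 * update_l1e() below) publish a new page-table entry with cmpxchg so that
 * a racing writer is detected instead of silently overwritten.  The same
 * pattern with C11 atomics standing in for the raw cmpxchg on the entry.
 */
#include <stdio.h>
#include <stdatomic.h>

/* Returns 1 on success, 0 if the entry changed under us. */
static int publish_entry(_Atomic unsigned long *slot,
                         unsigned long expected_old, unsigned long new_val)
{
    unsigned long seen = expected_old;
    if ( atomic_compare_exchange_strong(slot, &seen, new_val) )
        return 1;
    fprintf(stderr, "failed to update %#lx -> %#lx: saw %#lx\n",
            expected_old, new_val, seen);
    return 0;
}

int main(void)
{
    _Atomic unsigned long slot = 0x1000UL;
    printf("%d\n", publish_entry(&slot, 0x1000UL, 0x2003UL));  /* 1            */
    printf("%d\n", publish_entry(&slot, 0x1000UL, 0x3003UL));  /* 0: stale old */
    return 0;
}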
*/ + if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 ) + return update_l2e(pl2e, ol2e, nl2e); + + if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, + ((unsigned long)pl2e & + ~PAGE_MASK) >> 2)) ) + return 0; + + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) + { + put_page_from_l2e(nl2e, pfn); + return 0; + } + + put_page_from_l2e(ol2e, pfn); + return 1; + } + + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) + return 0; + + put_page_from_l2e(ol2e, pfn); + return 1; +} + + +static inline int update_l1e(l1_pgentry_t *pl1e, + l1_pgentry_t ol1e, + l1_pgentry_t nl1e) +{ + unsigned long o = l1_pgentry_val(ol1e); + unsigned long n = l1_pgentry_val(nl1e); + + if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || + unlikely(o != l1_pgentry_val(ol1e)) ) + { + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", + l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); + return 0; + } + + return 1; +} + + +/* Update the L1 entry at pl1e to new value nl1e. */ +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) +{ + l1_pgentry_t ol1e; + unsigned long _ol1e; + struct domain *d = current->domain; + + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) + { + MEM_LOG("Bad get_user\n"); + return 0; + } + + ol1e = mk_l1_pgentry(_ol1e); + + if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) + { + /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */ + if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 ) + return update_l1e(pl1e, ol1e, nl1e); + + if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) + return 0; + + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + { + put_page_from_l1e(nl1e, d); + return 0; + } + + put_page_from_l1e(ol1e, d); + return 1; + } + + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + return 0; + + put_page_from_l1e(ol1e, d); + return 1; +} + + +int alloc_page_type(struct pfn_info *page, unsigned int type) +{ + switch ( type ) + { + case PGT_l1_page_table: + return alloc_l1_table(page); + case PGT_l2_page_table: + return alloc_l2_table(page); + case PGT_gdt_page: + case PGT_ldt_page: + return alloc_segdesc_page(page); + default: + printk("Bad type in alloc_page_type %x t=%x c=%x\n", + type, page->u.inuse.type_info, + page->count_info); + BUG(); + } + + return 0; +} + + +void free_page_type(struct pfn_info *page, unsigned int type) +{ + struct domain *d = page_get_owner(page); + + switch ( type ) + { + case PGT_l1_page_table: + free_l1_table(page); + break; + + case PGT_l2_page_table: + free_l2_table(page); + break; + + default: + BUG(); + } + + if ( unlikely(shadow_mode(d)) && + (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) + { + unshadow_table(page_to_pfn(page), type); + put_shadow_status(d); + } +} + + +void put_page_type(struct pfn_info *page) +{ + u32 nx, x, y = page->u.inuse.type_info; + + again: + do { + x = y; + nx = x - 1; + + ASSERT((x & PGT_count_mask) != 0); + + /* + * The page should always be validated while a reference is held. The + * exception is during domain destruction, when we forcibly invalidate + * page-table pages if we detect a referential loop. + * See domain.c:relinquish_list(). + */ + ASSERT((x & PGT_validated) || + test_bit(DF_DYING, &page_get_owner(page)->d_flags)); + + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + /* Record TLB information for flush later. Races are harmless. 
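/*
 * Illustrative sketch, not part of this changeset: put_page_type() here
 * and get_page_type() below loop on cmpxchg until a consistent snapshot of
 * the packed type word is swapped atomically.  The same retry pattern
 * reduced to a bare counter in the low bits, using C11 atomics; the mask
 * is invented.
 */
#include <stdio.h>
#include <stdatomic.h>
#include <stdint.h>

#define COUNT_MASK 0x0000ffffu   /* illustrative */

static int get_ref(_Atomic uint32_t *type_info)
{
    uint32_t x = atomic_load(type_info), nx;
    do {
        nx = x + 1;
        if ( (nx & COUNT_MASK) == 0 )
            return 0;                  /* count would overflow */
    } while ( !atomic_compare_exchange_weak(type_info, &x, nx) );
    return 1;
}

static void put_ref(_Atomic uint32_t *type_info)
{
    uint32_t x = atomic_load(type_info);
    while ( !atomic_compare_exchange_weak(type_info, &x, x - 1) )
        ;                              /* 'x' is reloaded on failure */
}

int main(void)
{
    _Atomic uint32_t t = 0xabcd0000u;
    get_ref(&t); get_ref(&t); put_ref(&t);
    printf("%#x\n", (unsigned)atomic_load(&t));   /* 0xabcd0001 */
    return 0;
}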
*/ + page->tlbflush_timestamp = tlbflush_current_time(); + + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && + likely(nx & PGT_validated) ) + { + /* + * Page-table pages must be unvalidated when count is zero. The + * 'free' is safe because the refcnt is non-zero and validated + * bit is clear => other ops will spin or fail. + */ + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, + x & ~PGT_validated)) != x) ) + goto again; + /* We cleared the 'valid bit' so we do the clear up. */ + free_page_type(page, x & PGT_type_mask); + /* Carry on, but with the 'valid bit' now clear. */ + x &= ~PGT_validated; + nx &= ~PGT_validated; + } + } + else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == + (PGT_pinned | 1)) ) + { + /* Page is now only pinned. Make the back pointer mutable again. */ + nx |= PGT_va_mutable; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); +} + + +int get_page_type(struct pfn_info *page, u32 type) +{ + u32 nx, x, y = page->u.inuse.type_info; + + again: + do { + x = y; + nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page)); + return 0; + } + else if ( unlikely((x & PGT_count_mask) == 0) ) + { + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) + { + /* + * On type change we check to flush stale TLB entries. This + * may be unnecessary (e.g., page was GDT/LDT) but those + * circumstances should be very rare. + */ + struct domain *d = page_get_owner(page); + if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor], + page->tlbflush_timestamp)) ) + { + perfc_incr(need_flush_tlb_flush); + flush_tlb_cpu(d->exec_domain[0]->processor); + } + + /* We lose existing type, back pointer, and validity. */ + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); + nx |= type; + + /* No special validation needed for writable pages. */ + /* Page tables and GDT/LDT need to be scanned for validity. */ + if ( type == PGT_writable_page ) + nx |= PGT_validated; + } + } + else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) + { + if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) + { + if ( ((x & PGT_type_mask) != PGT_l2_page_table) || + ((type & PGT_type_mask) != PGT_l1_page_table) ) + MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( (x & PGT_va_mask) == PGT_va_mutable ) + { + /* The va backpointer is mutable, hence we update it. */ + nx &= ~PGT_va_mask; + nx |= type; /* we know the actual type is correct */ + } + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) + { + /* This table is potentially mapped at multiple locations. */ + nx &= ~PGT_va_mask; + nx |= PGT_va_unknown; + } + } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) == x ) + { + rep_nop(); + barrier(); + } + goto again; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + + if ( unlikely(!(nx & PGT_validated)) ) + { + /* Try to validate page type; drop the new reference on failure. */ + if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) + { + MEM_LOG("Error while validating pfn %08lx for type %08x." + " caf=%08x taf=%08x\n", + page_to_pfn(page), type, + page->count_info, + page->u.inuse.type_info); + /* Noone else can get a reference. We hold the only ref. 
*/ + page->u.inuse.type_info = 0; + return 0; + } + + /* Noone else is updating simultaneously. */ + __set_bit(_PGT_validated, &page->u.inuse.type_info); + } + + return 1; +} + + +int new_guest_cr3(unsigned long pfn) +{ + struct exec_domain *ed = current; + struct domain *d = ed->domain; + int okay, cpu = smp_processor_id(); + unsigned long old_base_pfn; + + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d); + if ( likely(okay) ) + { + invalidate_shadow_ldt(ed); + + percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; + old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT; + ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT); + + shadow_mk_pagetable(ed); + + write_ptbase(ed); + + put_page_and_type(&frame_table[old_base_pfn]); + } + else + { + MEM_LOG("Error while installing new baseptr %08lx", pfn); + } + + return okay; +} + +static int do_extended_command(unsigned long ptr, unsigned long val) +{ + int okay = 1, cpu = smp_processor_id(); + unsigned int cmd = val & MMUEXT_CMD_MASK; + unsigned long pfn = ptr >> PAGE_SHIFT; + struct pfn_info *page = &frame_table[pfn]; + struct exec_domain *ed = current; + struct domain *d = ed->domain, *nd, *e; + u32 x, y; + domid_t domid; + grant_ref_t gntref; + + switch ( cmd ) + { + case MMUEXT_PIN_L1_TABLE: + case MMUEXT_PIN_L2_TABLE: + /* + * We insist that, if you pin an L1 page, it's the first thing that + * you do to it. This is because we require the backptr to still be + * mutable. This assumption seems safe. + */ + okay = get_page_and_type_from_pagenr( + pfn, + ((cmd==MMUEXT_PIN_L2_TABLE) ? + PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), + FOREIGNDOM); + + if ( unlikely(!okay) ) + { + MEM_LOG("Error while pinning pfn %08lx", pfn); + break; + } + + if ( unlikely(test_and_set_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) + { + MEM_LOG("Pfn %08lx already pinned", pfn); + put_page_and_type(page); + okay = 0; + break; + } + + break; + + case MMUEXT_UNPIN_TABLE: + if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) + { + MEM_LOG("Page %08lx bad domain (dom=%p)", + ptr, page_get_owner(page)); + } + else if ( likely(test_and_clear_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) + { + put_page_and_type(page); + put_page(page); + } + else + { + okay = 0; + put_page(page); + MEM_LOG("Pfn %08lx not pinned", pfn); + } + break; + + case MMUEXT_NEW_BASEPTR: + okay = new_guest_cr3(pfn); + break; + + case MMUEXT_TLB_FLUSH: + percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; + break; + + case MMUEXT_INVLPG: + __flush_tlb_one(ptr); + break; + + case MMUEXT_FLUSH_CACHE: + if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) + { + MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); + okay = 0; + } + else + { + wbinvd(); + } + break; + + case MMUEXT_SET_LDT: + { + unsigned long ents = val >> MMUEXT_CMD_SHIFT; + if ( ((ptr & (PAGE_SIZE-1)) != 0) || + (ents > 8192) || + ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || + ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) + { + okay = 0; + MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); + } + else if ( (ed->arch.ldt_ents != ents) || + (ed->arch.ldt_base != ptr) ) + { + invalidate_shadow_ldt(ed); + ed->arch.ldt_base = ptr; + ed->arch.ldt_ents = ents; + load_LDT(ed); + percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT; + if ( ents != 0 ) + percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT; + } + break; + } + + case MMUEXT_SET_FOREIGNDOM: + domid = (domid_t)(val >> 16); + + if ( (e = percpu_info[cpu].foreign) != NULL ) + put_domain(e); + percpu_info[cpu].foreign = NULL; + + if ( !IS_PRIV(d) ) + 
{ + switch ( domid ) + { + case DOMID_IO: + get_knownalive_domain(dom_io); + percpu_info[cpu].foreign = dom_io; + break; + default: + MEM_LOG("Dom %u cannot set foreign dom\n", d->id); + okay = 0; + break; + } + } + else + { + percpu_info[cpu].foreign = e = find_domain_by_id(domid); + if ( e == NULL ) + { + switch ( domid ) + { + case DOMID_XEN: + get_knownalive_domain(dom_xen); + percpu_info[cpu].foreign = dom_xen; + break; + case DOMID_IO: + get_knownalive_domain(dom_io); + percpu_info[cpu].foreign = dom_io; + break; + default: + MEM_LOG("Unknown domain '%u'", domid); + okay = 0; + break; + } + } + } + break; + + case MMUEXT_TRANSFER_PAGE: + domid = (domid_t)(val >> 16); + gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF)); + + if ( unlikely(IS_XEN_HEAP_FRAME(page)) || + unlikely(!pfn_is_ram(pfn)) || + unlikely((e = find_domain_by_id(domid)) == NULL) ) + { + MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid); + okay = 0; + break; + } + + spin_lock(&d->page_alloc_lock); + + /* + * The tricky bit: atomically release ownership while there is just one + * benign reference to the page (PGC_allocated). If that reference + * disappears then the deallocation routine will safely spin. + */ + nd = page_get_owner(page); + y = page->count_info; + do { + x = y; + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != + (1|PGC_allocated)) || + unlikely(nd != d) ) + { + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," + " caf=%08x, taf=%08x\n", page_to_pfn(page), + d, d->id, nd, x, page->u.inuse.type_info); + spin_unlock(&d->page_alloc_lock); + put_domain(e); + return 0; + } + __asm__ __volatile__( + LOCK_PREFIX "cmpxchg8b %2" + : "=d" (nd), "=a" (y), + "=m" (*(volatile u64 *)(&page->count_info)) + : "0" (d), "1" (x), "c" (NULL), "b" (x) ); + } + while ( unlikely(nd != d) || unlikely(y != x) ); + + /* + * Unlink from 'd'. At least one reference remains (now anonymous), so + * noone else is spinning to try to delete this page from 'd'. + */ + d->tot_pages--; + list_del(&page->list); + + spin_unlock(&d->page_alloc_lock); + + spin_lock(&e->page_alloc_lock); + + /* + * Check that 'e' will accept the page and has reservation headroom. + * Also, a domain mustn't have PGC_allocated pages when it is dying. + */ + ASSERT(e->tot_pages <= e->max_pages); + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || + unlikely(e->tot_pages == e->max_pages) || + unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) + { + MEM_LOG("Transferee has no reservation headroom (%d,%d), or " + "provided a bad grant ref, or is dying (%08lx).\n", + e->tot_pages, e->max_pages, e->d_flags); + spin_unlock(&e->page_alloc_lock); + put_domain(e); + okay = 0; + break; + } + + /* Okay, add the page to 'e'. */ + if ( unlikely(e->tot_pages++ == 0) ) + get_knownalive_domain(e); + list_add_tail(&page->list, &e->page_list); + page_set_owner(page, e); + + spin_unlock(&e->page_alloc_lock); + + /* Transfer is all done: tell the guest about its new page frame. */ + gnttab_notify_transfer(e, gntref, pfn); + + put_domain(e); + break; + + case MMUEXT_REASSIGN_PAGE: + if ( unlikely(!IS_PRIV(d)) ) + { + MEM_LOG("Dom %u has no reassignment priv", d->id); + okay = 0; + break; + } + + e = percpu_info[cpu].foreign; + if ( unlikely(e == NULL) ) + { + MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn); + okay = 0; + break; + } + + /* + * Grab both page_list locks, in order. This prevents the page from + * disappearing elsewhere while we modify the owner, and we'll need + * both locks if we're successful so that we can change lists. 
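/*
 * Illustrative sketch, not part of this changeset: the page-reassignment
 * path here takes the two domains' page_alloc locks in a fixed (address)
 * order, so two CPUs reassigning in opposite directions cannot deadlock.
 * The same discipline with pthread mutexes standing in for the spinlocks.
 */
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

struct dom { pthread_mutex_t page_alloc_lock; };

static void lock_pair(struct dom *d, struct dom *e)
{
    /* Always take the lower-addressed lock first. */
    if ( (uintptr_t)d < (uintptr_t)e )
    {
        pthread_mutex_lock(&d->page_alloc_lock);
        pthread_mutex_lock(&e->page_alloc_lock);
    }
    else
    {
        pthread_mutex_lock(&e->page_alloc_lock);
        pthread_mutex_lock(&d->page_alloc_lock);
    }
}

static void unlock_pair(struct dom *d, struct dom *e)
{
    pthread_mutex_unlock(&d->page_alloc_lock);
    pthread_mutex_unlock(&e->page_alloc_lock);
}

int main(void)
{
    struct dom a = { PTHREAD_MUTEX_INITIALIZER };
    struct dom b = { PTHREAD_MUTEX_INITIALIZER };
    lock_pair(&a, &b);
    puts("both page_alloc locks held");
    unlock_pair(&a, &b);
    return 0;
}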
+ */ + if ( d < e ) + { + spin_lock(&d->page_alloc_lock); + spin_lock(&e->page_alloc_lock); + } + else + { + spin_lock(&e->page_alloc_lock); + spin_lock(&d->page_alloc_lock); + } + + /* A domain shouldn't have PGC_allocated pages when it is dying. */ + if ( unlikely(test_bit(DF_DYING, &e->d_flags)) || + unlikely(IS_XEN_HEAP_FRAME(page)) ) + { + MEM_LOG("Reassignment page is Xen heap, or dest dom is dying."); + okay = 0; + goto reassign_fail; + } + + /* + * The tricky bit: atomically change owner while there is just one + * benign reference to the page (PGC_allocated). If that reference + * disappears then the deallocation routine will safely spin. + */ + nd = page_get_owner(page); + y = page->count_info; + do { + x = y; + if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != + (1|PGC_allocated)) || + unlikely(nd != d) ) + { + MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p," + " caf=%08x, taf=%08x\n", page_to_pfn(page), + d, d->id, nd, x, page->u.inuse.type_info); + okay = 0; + goto reassign_fail; + } + __asm__ __volatile__( + LOCK_PREFIX "cmpxchg8b %3" + : "=d" (nd), "=a" (y), "=c" (e), + "=m" (*(volatile u64 *)(&page->count_info)) + : "0" (d), "1" (x), "c" (e), "b" (x) ); + } + while ( unlikely(nd != d) || unlikely(y != x) ); + + /* + * Unlink from 'd'. We transferred at least one reference to 'e', so + * noone else is spinning to try to delete this page from 'd'. + */ + d->tot_pages--; + list_del(&page->list); + + /* + * Add the page to 'e'. Someone may already have removed the last + * reference and want to remove the page from 'e'. However, we have + * the lock so they'll spin waiting for us. + */ + if ( unlikely(e->tot_pages++ == 0) ) + get_knownalive_domain(e); + list_add_tail(&page->list, &e->page_list); + + reassign_fail: + spin_unlock(&d->page_alloc_lock); + spin_unlock(&e->page_alloc_lock); + break; + + case MMUEXT_CLEAR_FOREIGNDOM: + if ( (e = percpu_info[cpu].foreign) != NULL ) + put_domain(e); + percpu_info[cpu].foreign = NULL; + break; + + default: + MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); + okay = 0; + break; + } + + return okay; +} + +int do_mmu_update( + mmu_update_t *ureqs, unsigned int count, unsigned int *pdone) +{ +/* + * We steal the m.s.b. of the @count parameter to indicate whether this + * invocation of do_mmu_update() is resuming a previously preempted call. + * We steal the next 15 bits to remember the current FOREIGNDOM. + */ +#define MMU_UPDATE_PREEMPTED (~(~0U>>1)) +#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16) +#define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<domain; + u32 type_info; + domid_t domid; + + LOCK_BIGLOCK(d); + + cleanup_writable_pagetable(d); + + if ( unlikely(shadow_mode(d)) ) + check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */ + + /* + * If we are resuming after preemption, read how much work we have already + * done. This allows us to set the @done output parameter correctly. + * We also reset FOREIGNDOM here. + */ + if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) ) + { + if ( !(count & MMU_UPDATE_PREEMPTED) ) + { + /* Count overflow into private FOREIGNDOM field. 
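/*
 * Illustrative sketch, not part of this changeset: do_mmu_update() here
 * folds its resume state into the 'count' argument -- the m.s.b. marks a
 * preempted call and the next 15 bits carry the FOREIGNDOM id, as the
 * comment above describes.  A standalone encode/decode of that packing.
 */
#include <stdio.h>

#define PREEMPTED   (~(~0U >> 1))             /* m.s.b. of an unsigned int */
#define FDOM_SHIFT  ((sizeof(int) * 8) - 16)  /* next 15 bits              */
#define FDOM_MASK   (0x7FFFU << FDOM_SHIFT)

static unsigned int encode(unsigned int remaining, unsigned int domid)
{
    return remaining | PREEMPTED | ((domid << FDOM_SHIFT) & FDOM_MASK);
}

int main(void)
{
    unsigned int c = encode(42, 7);
    printf("preempted=%d domid=%u remaining=%u\n",
           !!(c & PREEMPTED),
           (c & FDOM_MASK) >> FDOM_SHIFT,
           c & ~(PREEMPTED | FDOM_MASK));      /* 1, 7, 42 */
    return 0;
}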
*/ + MEM_LOG("do_mmu_update count is too large"); + rc = -EINVAL; + goto out; + } + count &= ~MMU_UPDATE_PREEMPTED; + domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; + count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK; + if ( unlikely(pdone != NULL) ) + (void)get_user(done, pdone); + if ( (domid != current->domain->id) && + !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) ) + { + rc = -EINVAL; + goto out; + } + } + + perfc_incrc(calls_to_mmu_update); + perfc_addc(num_page_updates, count); + + if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) ) + { + rc = -EFAULT; + goto out; + } + + for ( i = 0; i < count; i++ ) + { + if ( hypercall_preempt_check() ) + { + rc = hypercall3_create_continuation( + __HYPERVISOR_mmu_update, ureqs, + (count - i) | + (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | + MMU_UPDATE_PREEMPTED, pdone); + break; + } + + if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) ) + { + MEM_LOG("Bad __copy_from_user"); + rc = -EFAULT; + break; + } + + cmd = req.ptr & (sizeof(l1_pgentry_t)-1); + pfn = req.ptr >> PAGE_SHIFT; + + okay = 0; + + switch ( cmd ) + { + /* + * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. + */ + case MMU_NORMAL_PT_UPDATE: + if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) + { + MEM_LOG("Could not get page for normal update"); + break; + } + + if ( likely(prev_pfn == pfn) ) + { + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); + } + else + { + if ( prev_pfn != 0 ) + unmap_domain_mem((void *)va); + va = (unsigned long)map_domain_mem(req.ptr); + prev_pfn = pfn; + } + + page = &frame_table[pfn]; + switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) + { + case PGT_l1_page_table: + if ( likely(get_page_type( + page, type_info & (PGT_type_mask|PGT_va_mask))) ) + { + okay = mod_l1_entry((l1_pgentry_t *)va, + mk_l1_pgentry(req.val)); + + if ( unlikely(shadow_mode(d)) && okay && + (get_shadow_status(d, page-frame_table) & + PSH_shadowed) ) + { + shadow_l1_normal_pt_update( + req.ptr, req.val, &prev_smfn, &prev_spl1e); + put_shadow_status(d); + } + + put_page_type(page); + } + break; + case PGT_l2_page_table: + if ( likely(get_page_type(page, PGT_l2_page_table)) ) + { + okay = mod_l2_entry((l2_pgentry_t *)va, + mk_l2_pgentry(req.val), + pfn); + + if ( unlikely(shadow_mode(d)) && okay && + (get_shadow_status(d, page-frame_table) & + PSH_shadowed) ) + { + shadow_l2_normal_pt_update(req.ptr, req.val); + put_shadow_status(d); + } + + put_page_type(page); + } + break; + default: + if ( likely(get_page_type(page, PGT_writable_page)) ) + { + *(unsigned long *)va = req.val; + okay = 1; + put_page_type(page); + } + break; + } + + put_page(page); + break; + + case MMU_MACHPHYS_UPDATE: + if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) + { + MEM_LOG("Could not get page for mach->phys update"); + break; + } + + machine_to_phys_mapping[pfn] = req.val; + okay = 1; + + /* + * If in log-dirty mode, mark the corresponding pseudo-physical + * page as dirty. + */ + if ( unlikely(shadow_mode(d) == SHM_logdirty) && + mark_dirty(d, pfn) ) + d->arch.shadow_dirty_block_count++; + + put_page(&frame_table[pfn]); + break; + + /* + * MMU_EXTENDED_COMMAND: Extended command is specified + * in the least-siginificant bits of the 'value' field. 
+ */ + case MMU_EXTENDED_COMMAND: + req.ptr &= ~(sizeof(l1_pgentry_t) - 1); + okay = do_extended_command(req.ptr, req.val); + break; + + default: + MEM_LOG("Invalid page update command %08lx", req.ptr); + break; + } + + if ( unlikely(!okay) ) + { + rc = -EINVAL; + break; + } + + ureqs++; + } + + out: + if ( prev_pfn != 0 ) + unmap_domain_mem((void *)va); + + if ( unlikely(prev_spl1e != 0) ) + unmap_domain_mem((void *)prev_spl1e); + + deferred_ops = percpu_info[cpu].deferred_ops; + percpu_info[cpu].deferred_ops = 0; + + if ( deferred_ops & DOP_FLUSH_TLB ) + local_flush_tlb(); + + if ( deferred_ops & DOP_RELOAD_LDT ) + (void)map_ldt_shadow_page(0); + + if ( unlikely(percpu_info[cpu].foreign != NULL) ) + { + put_domain(percpu_info[cpu].foreign); + percpu_info[cpu].foreign = NULL; + } + + /* Add incremental work we have done to the @done output parameter. */ + if ( unlikely(pdone != NULL) ) + __put_user(done + i, pdone); + + if ( unlikely(shadow_mode(d)) ) + check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */ + + UNLOCK_BIGLOCK(d); + return rc; +} + + +int do_update_va_mapping(unsigned long va, + unsigned long val, + unsigned long flags) +{ + struct exec_domain *ed = current; + struct domain *d = ed->domain; + int err = 0; + unsigned int cpu = ed->processor; + unsigned long deferred_ops; + + perfc_incrc(calls_to_update_va); + + if ( unlikely(!__addr_ok(va)) ) + return -EINVAL; + + LOCK_BIGLOCK(d); + + cleanup_writable_pagetable(d); + + /* + * XXX When we make this support 4MB superpages we should also deal with + * the case of updating L2 entries. + */ + + if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], + mk_l1_pgentry(val))) ) + err = -EINVAL; + + if ( unlikely(shadow_mode(d)) ) + { + unsigned long sval = 0; + + l1pte_propagate_from_guest(d, &val, &sval); + + if ( unlikely(__put_user(sval, ((unsigned long *)( + &shadow_linear_pg_table[l1_linear_offset(va)])))) ) + { + /* + * Since L2's are guranteed RW, failure indicates the page was not + * shadowed, so ignore. + */ + perfc_incrc(shadow_update_va_fail); + } + + /* + * If we're in log-dirty mode then we need to note that we've updated + * the PTE in the PT-holding page. We need the machine frame number + * for this. 
+ */ + if ( shadow_mode(d) == SHM_logdirty ) + mark_dirty(d, va_to_l1mfn(va)); + + check_pagetable(d, ed->arch.pagetable, "va"); /* debug */ + } + + deferred_ops = percpu_info[cpu].deferred_ops; + percpu_info[cpu].deferred_ops = 0; + + if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || + unlikely(flags & UVMF_FLUSH_TLB) ) + local_flush_tlb(); + else if ( unlikely(flags & UVMF_INVLPG) ) + __flush_tlb_one(va); + + if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) + (void)map_ldt_shadow_page(0); + + UNLOCK_BIGLOCK(d); + + return err; +} + +int do_update_va_mapping_otherdomain(unsigned long va, + unsigned long val, + unsigned long flags, + domid_t domid) +{ + unsigned int cpu = smp_processor_id(); + struct domain *d; + int rc; + + if ( unlikely(!IS_PRIV(current->domain)) ) + return -EPERM; + + percpu_info[cpu].foreign = d = find_domain_by_id(domid); + if ( unlikely(d == NULL) ) + { + MEM_LOG("Unknown domain '%u'", domid); + return -ESRCH; + } + + rc = do_update_va_mapping(va, val, flags); + + put_domain(d); + percpu_info[cpu].foreign = NULL; + + return rc; +} + + + +/************************* + * Descriptor Tables + */ + +void destroy_gdt(struct exec_domain *ed) +{ + int i; + unsigned long pfn; + + for ( i = 0; i < 16; i++ ) + { + if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 ) + put_page_and_type(&frame_table[pfn]); + ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0); + } +} + + +long set_gdt(struct exec_domain *ed, + unsigned long *frames, + unsigned int entries) +{ + struct domain *d = ed->domain; + /* NB. There are 512 8-byte entries per GDT page. */ + int i = 0, nr_pages = (entries + 511) / 512; + struct desc_struct *vgdt; + unsigned long pfn; + + /* Check the first page in the new GDT. */ + if ( (pfn = frames[0]) >= max_page ) + goto fail; + + /* The first page is special because Xen owns a range of entries in it. */ + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) + { + /* GDT checks failed: try zapping the Xen reserved entries. */ + if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) ) + goto fail; + vgdt = map_domain_mem(pfn << PAGE_SHIFT); + memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0, + NR_RESERVED_GDT_ENTRIES*8); + unmap_domain_mem(vgdt); + put_page_and_type(&frame_table[pfn]); + + /* Okay, we zapped the entries. Now try the GDT checks again. */ + if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) + goto fail; + } + + /* Check the remaining pages in the new GDT. */ + for ( i = 1; i < nr_pages; i++ ) + if ( ((pfn = frames[i]) >= max_page) || + !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) + goto fail; + + /* Copy reserved GDT entries to the new GDT. */ + vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); + memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, + gdt_table + FIRST_RESERVED_GDT_ENTRY, + NR_RESERVED_GDT_ENTRIES*8); + unmap_domain_mem(vgdt); + + /* Tear down the old GDT. */ + destroy_gdt(ed); + + /* Install the new GDT. 
*/ + for ( i = 0; i < nr_pages; i++ ) + ed->arch.perdomain_ptes[i] = + mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed)); + SET_GDT_ENTRIES(ed, entries); + + return 0; + + fail: + while ( i-- > 0 ) + put_page_and_type(&frame_table[frames[i]]); + return -EINVAL; +} + + +long do_set_gdt(unsigned long *frame_list, unsigned int entries) +{ + int nr_pages = (entries + 511) / 512; + unsigned long frames[16]; + long ret; + + if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) + return -EINVAL; + + if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) ) + return -EFAULT; + + LOCK_BIGLOCK(current->domain); + + if ( (ret = set_gdt(current, frames, entries)) == 0 ) + { + local_flush_tlb(); + __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt)); + } + + UNLOCK_BIGLOCK(current->domain); + + return ret; +} + + +long do_update_descriptor( + unsigned long pa, unsigned long word1, unsigned long word2) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + struct desc_struct *gdt_pent, d; + struct pfn_info *page; + struct exec_domain *ed; + long ret = -EINVAL; + + d.a = (u32)word1; + d.b = (u32)word2; + + LOCK_BIGLOCK(current->domain); + + if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { + UNLOCK_BIGLOCK(current->domain); + return -EINVAL; + } + + page = &frame_table[pfn]; + if ( unlikely(!get_page(page, current->domain)) ) { + UNLOCK_BIGLOCK(current->domain); + return -EINVAL; + } + + /* Check if the given frame is in use in an unsafe context. */ + switch ( page->u.inuse.type_info & PGT_type_mask ) + { + case PGT_gdt_page: + /* Disallow updates of Xen-reserved descriptors in the current GDT. */ + for_each_exec_domain(current->domain, ed) { + if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) && + (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && + (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) + goto out; + } + if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) + goto out; + break; + case PGT_ldt_page: + if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) + goto out; + break; + default: + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) + goto out; + break; + } + + /* All is good so make the update. */ + gdt_pent = map_domain_mem(pa); + memcpy(gdt_pent, &d, 8); + unmap_domain_mem(gdt_pent); + + put_page_type(page); + + ret = 0; /* success */ + + out: + put_page(page); + + UNLOCK_BIGLOCK(current->domain); + + return ret; +} + + + +/************************* + * Writable Pagetables + */ + +ptwr_info_t ptwr_info[NR_CPUS]; + +#ifdef VERBOSE +int ptwr_debug = 0x0; +#define PTWR_PRINTK(_f, _a...) \ + do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) +#define PTWR_PRINT_WHICH (which ? 'I' : 'A') +#else +#define PTWR_PRINTK(_f, _a...) ((void)0) +#endif + +/* Flush the given writable p.t. page and write-protect it again. */ +void ptwr_flush(const int which) +{ + unsigned long sstat, spte, pte, *ptep, l1va; + l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; + l2_pgentry_t *pl2e; + int i, cpu = smp_processor_id(); + struct exec_domain *ed = current; + struct domain *d = ed->domain; + + l1va = ptwr_info[cpu].ptinfo[which].l1va; + ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT]; + + /* + * STEP 1. Write-protect the p.t. page so no more updates can occur. + */ + + if ( unlikely(__get_user(pte, ptep)) ) + { + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); + /* + * Really a bug. 
We could read this PTE during the initial fault, + * and pagetables can't have changed meantime. XXX Multi-CPU guests? + */ + BUG(); + } + PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n", + PTWR_PRINT_WHICH, ptep, pte); + pte &= ~_PAGE_RW; + + if ( unlikely(shadow_mode(d)) ) + { + /* Write-protect the p.t. page in the shadow page table. */ + l1pte_propagate_from_guest(d, &pte, &spte); + __put_user( + spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]); + + /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ + sstat = get_shadow_status(d, pte >> PAGE_SHIFT); + if ( sstat & PSH_shadowed ) + sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); + } + + /* Write-protect the p.t. page in the guest page table. */ + if ( unlikely(__put_user(pte, ptep)) ) + { + MEM_LOG("ptwr: Could not update pte at %p\n", ptep); + /* + * Really a bug. We could write this PTE during the initial fault, + * and pagetables can't have changed meantime. XXX Multi-CPU guests? + */ + BUG(); + } + + /* Ensure that there are no stale writable mappings in any TLB. */ + /* NB. INVLPG is a serialising instruction: flushes pending updates. */ +#if 1 + __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */ +#else + flush_tlb_all(); +#endif + PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n", + PTWR_PRINT_WHICH, ptep, pte); + + /* + * STEP 2. Validate any modified PTEs. + */ + + pl1e = ptwr_info[cpu].ptinfo[which].pl1e; + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) + { + ol1e = ptwr_info[cpu].ptinfo[which].page[i]; + nl1e = pl1e[i]; + + if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) ) + continue; + + /* + * Fast path for PTEs that have merely been write-protected + * (e.g., during a Unix fork()). A strict reduction in privilege. + */ + if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) + { + if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) + { + if ( unlikely(sl1e != NULL) ) + l1pte_propagate_from_guest( + d, &l1_pgentry_val(nl1e), + &l1_pgentry_val(sl1e[i])); + put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]); + } + continue; + } + + if ( unlikely(!get_page_from_l1e(nl1e, d)) ) + { + MEM_LOG("ptwr: Could not re-validate l1 page\n"); + /* + * Make the remaining p.t's consistent before crashing, so the + * reference counts are correct. + */ + memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i], + (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t)); + unmap_domain_mem(pl1e); + ptwr_info[cpu].ptinfo[which].l1va = 0; + UNLOCK_BIGLOCK(d); + domain_crash(); + } + + if ( unlikely(sl1e != NULL) ) + l1pte_propagate_from_guest( + d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); + + if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) + put_page_from_l1e(ol1e, d); + } + unmap_domain_mem(pl1e); + + /* + * STEP 3. Reattach the L1 p.t. page into the current address space. + */ + + if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) ) + { + pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; + *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); + } + + /* + * STEP 4. Final tidy-up. + */ + + ptwr_info[cpu].ptinfo[which].l1va = 0; + + if ( unlikely(sl1e != NULL) ) + { + unmap_domain_mem(sl1e); + put_shadow_status(d); + } +} + +/* Write page fault handler: check if guest is trying to modify a PTE. 
*/ +int ptwr_do_page_fault(unsigned long addr) +{ + unsigned long pte, pfn, l2e; + struct pfn_info *page; + l2_pgentry_t *pl2e; + int which, cpu = smp_processor_id(); + u32 l2_idx; + +#ifdef __x86_64__ + return 0; /* Writable pagetables need fixing for x86_64. */ +#endif + + /* + * Attempt to read the PTE that maps the VA being accessed. By checking for + * PDE validity in the L2 we avoid many expensive fixups in __get_user(). + */ + if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & + _PAGE_PRESENT) || + __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) ) + { + return 0; + } + + pfn = pte >> PAGE_SHIFT; + page = &frame_table[pfn]; + + /* We are looking only for read-only mappings of p.t. pages. */ + if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) || + ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ) + { + return 0; + } + + /* Get the L2 index at which this L1 p.t. is always mapped. */ + l2_idx = page->u.inuse.type_info & PGT_va_mask; + if ( unlikely(l2_idx >= PGT_va_unknown) ) + { + domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */ + } + l2_idx >>= PGT_va_shift; + + if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) + { + MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr); + domain_crash(); + } + + /* + * Is the L1 p.t. mapped into the current address space? If so we call it + * an ACTIVE p.t., otherwise it is INACTIVE. + */ + pl2e = &linear_l2_table[l2_idx]; + l2e = l2_pgentry_val(*pl2e); + which = PTWR_PT_INACTIVE; + if ( (l2e >> PAGE_SHIFT) == pfn ) + { + /* Check the PRESENT bit to set ACTIVE. */ + if ( likely(l2e & _PAGE_PRESENT) ) + which = PTWR_PT_ACTIVE; + else { + /* + * If the PRESENT bit is clear, we may be conflicting with + * the current ACTIVE p.t. (it may be the same p.t. mapped + * at another virt addr). + * The ptwr_flush call below will restore the PRESENT bit. + */ + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va && + l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx ) + which = PTWR_PT_ACTIVE; + } + } + + PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, " + "pfn %08lx\n", PTWR_PRINT_WHICH, + addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); + + /* + * We only allow one ACTIVE and one INACTIVE p.t. to be updated at at + * time. If there is already one, we must flush it out. + */ + if ( ptwr_info[cpu].ptinfo[which].l1va ) + ptwr_flush(which); + + ptwr_info[cpu].ptinfo[which].l1va = addr | 1; + ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; + + /* For safety, disconnect the L1 p.t. page from current space. */ + if ( (which == PTWR_PT_ACTIVE) && + likely(!shadow_mode(current->domain)) ) + { + *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); +#if 1 + flush_tlb(); /* XXX Multi-CPU guests? */ +#else + flush_tlb_all(); +#endif + } + + /* Temporarily map the L1 page, and make a copy of it. */ + ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT); + memcpy(ptwr_info[cpu].ptinfo[which].page, + ptwr_info[cpu].ptinfo[which].pl1e, + ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t)); + + /* Finally, make the p.t. page writable by the guest OS. */ + pte |= _PAGE_RW; + PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH, + &linear_pg_table[addr>>PAGE_SHIFT], pte); + if ( unlikely(__put_user(pte, (unsigned long *) + &linear_pg_table[addr>>PAGE_SHIFT])) ) + { + MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) + &linear_pg_table[addr>>PAGE_SHIFT]); + /* Toss the writable pagetable state and crash. 
*/ + unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); + ptwr_info[cpu].ptinfo[which].l1va = 0; + domain_crash(); + } + + return EXCRET_fault_fixed; +} + +static __init int ptwr_init(void) +{ + int i; + + for ( i = 0; i < smp_num_cpus; i++ ) + { + ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page = + (void *)alloc_xenheap_page(); + ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page = + (void *)alloc_xenheap_page(); + } + + return 0; +} +__initcall(ptwr_init); + + + + +/************************************************************************/ +/************************************************************************/ +/************************************************************************/ + +#ifndef NDEBUG + +void ptwr_status(void) +{ + unsigned long pte, *ptep, pfn; + struct pfn_info *page; + int cpu = smp_processor_id(); + + ptep = (unsigned long *)&linear_pg_table + [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; + + if ( __get_user(pte, ptep) ) { + MEM_LOG("ptwr: Could not read pte at %p\n", ptep); + domain_crash(); + } + + pfn = pte >> PAGE_SHIFT; + page = &frame_table[pfn]; + printk("need to alloc l1 page %p\n", page); + /* make pt page writable */ + printk("need to make read-only l1-page at %p is %08lx\n", + ptep, pte); + + if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 ) + return; + + if ( __get_user(pte, (unsigned long *) + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { + MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) + ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); + domain_crash(); + } + pfn = pte >> PAGE_SHIFT; + page = &frame_table[pfn]; +} + +void audit_domain(struct domain *d) +{ + int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; + + void adjust (struct pfn_info *page, int dir, int adjtype) + { + int count = page->count_info & PGC_count_mask; + + if ( adjtype ) + { + int tcount = page->u.inuse.type_info & PGT_count_mask; + + ttot++; + + tcount += dir; + + if ( tcount < 0 ) + { + /* This will only come out once. */ + printk("Audit %d: type count whent below zero pfn=%x " + "taf=%x otaf=%x\n", + d->id, page-frame_table, + page->u.inuse.type_info, + page->tlbflush_timestamp); + } + + page->u.inuse.type_info = + (page->u.inuse.type_info & ~PGT_count_mask) | + (tcount & PGT_count_mask); + } + + ctot++; + count += dir; + if ( count < 0 ) + { + /* This will only come out once. 
*/
+            printk("Audit %d: general count whent below zero pfn=%x "
+                   "taf=%x otaf=%x\n",
+                   d->id, page-frame_table,
+                   page->u.inuse.type_info,
+                   page->tlbflush_timestamp);
+        }
+
+        page->count_info =
+            (page->count_info & ~PGC_count_mask) |
+            (count & PGC_count_mask);
+
+    }
+
+    void scan_for_pfn(struct domain *d, unsigned long xpfn)
+    {
+        unsigned long pfn, *pt;
+        struct list_head *list_ent;
+        struct pfn_info *page;
+        int i;
+
+        list_ent = d->page_list.next;
+        for ( i = 0; (list_ent != &d->page_list); i++ )
+        {
+            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+            page = &frame_table[pfn];
+
+            switch ( page->u.inuse.type_info & PGT_type_mask )
+            {
+            case PGT_l1_page_table:
+            case PGT_l2_page_table:
+                pt = map_domain_mem(pfn<<PAGE_SHIFT);
+
+                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+                    if ( (pt[i] & _PAGE_PRESENT) &&
+                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
+                        printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
+                               d->id, i, pfn, page->u.inuse.type_info,
+                               page->count_info);
+                unmap_domain_mem(pt);
+            }
+
+            list_ent = frame_table[pfn].list.next;
+        }
+
+    }
+
+    void scan_for_pfn_remote(unsigned long xpfn)
+    {
+        struct domain *e;
+        for_each_domain ( e )
+            scan_for_pfn( e, xpfn );
+    }
+
+    int i;
+    unsigned long pfn;
+    struct list_head *list_ent;
+    struct pfn_info *page;
+
+    if ( d != current->domain )
+        domain_pause(d);
+    synchronise_pagetables(~0UL);
+
+    printk("pt base=%lx sh_info=%x\n",
+           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
+           virt_to_page(d->shared_info)-frame_table);
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* PHASE 0 */
+
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        if ( page_get_owner(page) != d )
+            BUG();
+
+        if ( (page->u.inuse.type_info & PGT_count_mask) >
+             (page->count_info & PGC_count_mask) )
+            printk("taf > caf %x %x pfn=%lx\n",
+                   page->u.inuse.type_info, page->count_info, pfn );
+
+#if 0   /* SYSV shared memory pages plus writeable files. */
+        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
+             (page->u.inuse.type_info & PGT_count_mask) > 1 )
+        {
+            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
+                   pfn,
+                   page->u.inuse.type_info,
+                   page->count_info );
+            scan_for_pfn_remote(pfn);
+        }
+#endif
+        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
+             (page->u.inuse.type_info & PGT_count_mask) > 1 )
+        {
+            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
+                   pfn,
+                   page->u.inuse.type_info,
+                   page->count_info );
+        }
+
+        /* Use tlbflush_timestamp to store original type_info. */
+        page->tlbflush_timestamp = page->u.inuse.type_info;
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+
+    /* PHASE 1 */
+
+    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
+
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        unsigned long *pt;
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        if ( page_get_owner(page) != d )
+            BUG();
+
+        switch ( page->u.inuse.type_info & PGT_type_mask )
+        {
+        case PGT_l2_page_table:
+
+            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+                printk("Audit %d: L2 not validated %x\n",
+                       d->id, page->u.inuse.type_info);
+
+            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                printk("Audit %d: L2 not pinned %x\n",
+                       d->id, page->u.inuse.type_info);
+            else
+                adjust( page, -1, 1 );
+
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page = &frame_table[l1pfn];
+
+                    if ( page_get_owner(l1page) != d )
+                    {
+                        printk("L2: Skip bizarre page belonging to other "
+                               "dom %p\n", page_get_owner(l1page));
+                        continue;
+                    }
+
+                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+                         PGT_l2_page_table )
+                        printk("Audit %d: [%x] Found %s Linear PT "
+                               "t=%x pfn=%lx\n", d->id, i,
+                               (l1pfn==pfn) ? "Self" : "Other",
+                               l1page->u.inuse.type_info,
+                               l1pfn);
+                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
+                              PGT_l1_page_table )
+                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
+                               d->id, i,
+                               l1page->u.inuse.type_info,
+                               l1pfn);
+
+                    adjust(l1page, -1, 1);
+                }
+            }
+
+            unmap_domain_mem(pt);
+
+            break;
+
+
+        case PGT_l1_page_table:
+
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, -1, 1 );
+
+            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+                printk("Audit %d: L1 not validated %x\n",
+                       d->id, page->u.inuse.type_info);
+#if 0
+            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                printk("Audit %d: L1 not pinned %x\n",
+                       d->id, page->u.inuse.type_info);
+#endif
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page = &frame_table[l1pfn];
+
+                    if ( l1pfn < 0x100 )
+                    {
+                        lowmem_mappings++;
+                        continue;
+                    }
+
+                    if ( l1pfn > max_page )
+                    {
+                        io_mappings++;
+                        continue;
+                    }
+
+                    if ( pt[i] & _PAGE_RW )
+                    {
+
+                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+                             PGT_l1_page_table ||
+                             (l1page->u.inuse.type_info & PGT_type_mask) ==
+                             PGT_l2_page_table )
+                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
+                                   d->id, i,
+                                   l1page->u.inuse.type_info,
+                                   l1pfn);
+
+                    }
+
+                    if ( page_get_owner(l1page) != d )
+                    {
+                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
+                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
+                               d->id, pfn, i,
+                               page_get_owner(l1page),
+                               l1pfn,
+                               l1page->count_info,
+                               l1page->u.inuse.type_info,
+                               machine_to_phys_mapping[l1pfn]);
+                        continue;
+                    }
+
+                    adjust(l1page, -1, 0);
+                }
+            }
+
+            unmap_domain_mem(pt);
+
+            break;
+        }
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
+        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
+               d->id, lowmem_mappings, io_mappings);
+
+    /* PHASE 2 */
+
+    ctot = ttot = 0;
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        switch ( page->u.inuse.type_info & PGT_type_mask)
+        {
+        case PGT_l1_page_table:
+        case PGT_l2_page_table:
+            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
+            {
+                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
+                       d->id, page->u.inuse.type_info,
+                       page->tlbflush_timestamp,
+                       page->count_info, pfn );
+                scan_for_pfn_remote(pfn);
+            }
+        default:
+            if ( (page->count_info & PGC_count_mask) != 1 )
+            {
+                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
+                       d->id,
+                       page->count_info,
+                       page->u.inuse.type_info,
+                       page->tlbflush_timestamp, pfn );
+                scan_for_pfn_remote(pfn);
+            }
+            break;
+        }
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    /* PHASE 3 */
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        unsigned long *pt;
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        switch ( page->u.inuse.type_info & PGT_type_mask )
+        {
+        case PGT_l2_page_table:
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, 1, 1 );
+
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page;
+
+                    if (l1pfn>max_page)
+                        continue;
+
+                    l1page = &frame_table[l1pfn];
+
+                    if ( page_get_owner(l1page) == d )
+                        adjust(l1page, 1, 1);
+                }
+            }
+
+            unmap_domain_mem(pt);
+            break;
+
+        case PGT_l1_page_table:
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, 1, 1 );
+
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page;
+
+                    if (l1pfn>max_page)
+                        continue;
+
+                    l1page = &frame_table[l1pfn];
+
+                    if ( (page_get_owner(l1page) != d) ||
+                         (l1pfn < 0x100) || (l1pfn > max_page) )
+                        continue;
+
+                    adjust(l1page, 1, 0);
+                }
+            }
+
+            unmap_domain_mem(pt);
+            break;
+        }
+
+
+        page->tlbflush_timestamp = 0;
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    adjust(&frame_table[pagetable_val(
+        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
+
+    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
+
+    if ( d != current->domain )
+        domain_unpause(d);
+}
+
+void audit_domains(void)
+{
+    struct domain *d;
+    for_each_domain ( d )
+        audit_domain(d);
+}
+
+void audit_domains_key(unsigned char key)
+{
+    audit_domains();
+}
+
+#endif
diff -r ea98f0bb6510 -r 9f7935ea4606 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/xen/arch/x86/traps.c	Tue Feb 08 15:13:51 2005 +0000
@@ -528,7 +528,7 @@ asmlinkage int do_general_protection(str
 
     /* Emulate some simple privileged instructions when exec'ed in ring 1. */
     if ( (regs->error_code == 0) &&
-         RING_1(regs) &&
+         GUESTOS_FAULT(regs) &&
          emulate_privileged_op(regs) )
         return 0;
 
diff -r ea98f0bb6510 -r 9f7935ea4606 xen/common/dom_mem_ops.c
--- a/xen/common/dom_mem_ops.c	Tue Feb 08 12:27:23 2005 +0000
+++ b/xen/common/dom_mem_ops.c	Tue Feb 08 15:13:51 2005 +0000
@@ -122,7 +122,7 @@ free_dom_mem(struct domain *d,
 long
 do_dom_mem_op(unsigned long  op,
               unsigned long *extent_list,
-              unsigned long  nr_extents,
+              unsigned int   nr_extents,
              unsigned int   extent_order,
              domid_t        domid)
 {
@@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long op,
     start_extent  = op >> START_EXTENT_SHIFT;
     op           &= (1 << START_EXTENT_SHIFT) - 1;
 
-    if ( unlikely(start_extent > nr_extents) ||
-         unlikely(nr_extents > ~0U) ) /* can pack into a uint?
*/ + if ( unlikely(start_extent > nr_extents) ) return -EINVAL; if ( likely(domid == DOMID_SELF) ) @@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long op, { case MEMOP_increase_reservation: rc = alloc_dom_mem( - d, extent_list, start_extent, - (unsigned int)nr_extents, extent_order); + d, extent_list, start_extent, nr_extents, extent_order); break; case MEMOP_decrease_reservation: rc = free_dom_mem( - d, extent_list, start_extent, - (unsigned int)nr_extents, extent_order); + d, extent_list, start_extent, nr_extents, extent_order); break; default: rc = -ENOSYS; diff -r ea98f0bb6510 -r 9f7935ea4606 xen/include/asm-x86/page.h --- a/xen/include/asm-x86/page.h Tue Feb 08 12:27:23 2005 +0000 +++ b/xen/include/asm-x86/page.h Tue Feb 08 15:13:51 2005 +0000 @@ -99,6 +99,13 @@ typedef struct { unsigned long l4_lo; } (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1)) #endif +/* Given a virtual address, get an entry offset into a linear page table. */ +#if defined(__i386__) +#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT) +#elif defined(__x86_64__) +#define l1_linear_offset(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT) +#endif + #if defined(__i386__) #define pagetable_t l2_pgentry_t #define pagetable_val(_x) ((_x).l2_lo) diff -r ea98f0bb6510 -r 9f7935ea4606 xen/include/asm-x86/x86_32/regs.h --- a/xen/include/asm-x86/x86_32/regs.h Tue Feb 08 12:27:23 2005 +0000 +++ b/xen/include/asm-x86/x86_32/regs.h Tue Feb 08 15:13:51 2005 +0000 @@ -39,4 +39,6 @@ struct xen_regs #define RING_2(_r) (((_r)->cs & 3) == 2) #define RING_3(_r) (((_r)->cs & 3) == 3) +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r)) + #endif diff -r ea98f0bb6510 -r 9f7935ea4606 xen/include/asm-x86/x86_64/regs.h --- a/xen/include/asm-x86/x86_64/regs.h Tue Feb 08 12:27:23 2005 +0000 +++ b/xen/include/asm-x86/x86_64/regs.h Tue Feb 08 15:13:51 2005 +0000 @@ -36,4 +36,6 @@ struct xen_regs #define RING_2(_r) (((_r)->cs & 3) == 2) #define RING_3(_r) (((_r)->cs & 3) == 3) +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r)) + #endif diff -r ea98f0bb6510 -r 9f7935ea4606 xen/include/asm-x86/x86_64/uaccess.h --- a/xen/include/asm-x86/x86_64/uaccess.h Tue Feb 08 12:27:23 2005 +0000 +++ b/xen/include/asm-x86/x86_64/uaccess.h Tue Feb 08 15:13:51 2005 +0000 @@ -15,34 +15,19 @@ #define VERIFY_READ 0 #define VERIFY_WRITE 1 -#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START) - /* - * Test whether a block of memory is a valid user space address. - * Returns 0 if the range is valid, nonzero otherwise. - * - * This is equivalent to the following test: - * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ? - * (((u65)addr + (u65)size) >= ((u65)1 << 64)) : - * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START)) + * Valid if in +ve half of 48-bit address space, or above Xen-reserved area. + * This is also valid for range checks (addr, addr+size). As long as the + * start address is outside the Xen-reserved area then we will access a + * non-canonical address (and thus fault) before ever reaching VIRT_START. 
*/ -#define __range_not_ok(addr,size) ({ \ - unsigned long flag,sum; \ - if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \ - asm("addq %3,%1 ; sbbq %0,%0" \ - :"=&r" (flag), "=r" (sum) \ - :"1" (addr),"g" ((long)(size))); \ - else \ - asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0" \ - :"=&r" (flag), "=r" (sum) \ - :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \ - flag; }) +#define __addr_ok(addr) \ + (((unsigned long)(addr) < (1UL<<48)) || \ + ((unsigned long)(addr) >= HYPERVISOR_VIRT_END)) -#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0) +#define access_ok(type, addr, size) (__addr_ok(addr)) -#define array_access_ok(type,addr,count,size) \ - (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ && \ - access_ok(type,addr,(unsigned long)count*(unsigned long)size)) +#define array_access_ok(type,addr,count,size) (__addr_ok(addr)) extern long __get_user_bad(void); extern void __put_user_bad(void);
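
The new x86_64 __addr_ok() above drops the old carry-chain range check entirely: an address is acceptable if it lies in the lower 48-bit half of the address space or above the Xen-reserved area, and a multi-byte access that starts in the lower half must hit the non-canonical hole (and fault) before it could ever reach Xen's range. The following user-space sketch mirrors that predicate; the HYPERVISOR_VIRT_END value and the sample addresses are placeholders chosen for illustration, not the hypervisor's real constants.

/* Sketch of the simplified x86_64 access check above.  The constant and
 * the sample addresses are illustrative placeholders only; the real
 * HYPERVISOR_VIRT_END comes from Xen's own headers. */
#include <stdint.h>
#include <stdio.h>

#define HYPERVISOR_VIRT_END  UINT64_C(0xffff880000000000)  /* placeholder */

static int addr_ok(uint64_t addr)
{
    /* Lower (guest-usable) half of the 48-bit space, or above Xen's area. */
    return (addr < (UINT64_C(1) << 48)) || (addr >= HYPERVISOR_VIRT_END);
}

int main(void)
{
    /* A typical user-space address passes; an address inside the assumed
     * Xen-reserved window does not.  No (addr, addr+size) check is needed:
     * a range starting below 2^48 reaches the non-canonical hole, and thus
     * faults, before it can touch the reserved area. */
    printf("user va:     %d\n", addr_ok(UINT64_C(0x00007f0000001000)));
    printf("xen-area va: %d\n", addr_ok(UINT64_C(0xffff830000000000)));
    return 0;
}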
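
When do_mmu_update is preempted it re-encodes its remaining work into the hypercall's count argument: the low bits carry the number of outstanding requests, the bits above MMU_UPDATE_PREEMPT_FDOM_SHIFT carry the foreign domain id, and MMU_UPDATE_PREEMPTED marks the call as a continuation. Below is a minimal standalone sketch of that round trip; the bit positions are assumptions chosen only to make the example self-contained and need not match the hypervisor's real header values.

/* Sketch of the 'count' encoding used by the preemptible do_mmu_update
 * path.  The bit positions below are assumed for illustration; Xen's
 * real values are defined in its own headers. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MMU_UPDATE_PREEMPTED          (UINT32_C(1) << 31)   /* assumed */
#define MMU_UPDATE_PREEMPT_FDOM_SHIFT 16                    /* assumed */
#define MMU_UPDATE_PREEMPT_FDOM_MASK  (UINT32_C(0x7fff) << MMU_UPDATE_PREEMPT_FDOM_SHIFT)

/* What the continuation passes back in 'count': remaining requests,
 * the current foreign domain, and the "this was preempted" marker. */
static uint32_t encode_count(uint32_t remaining, uint16_t foreign_dom)
{
    assert(remaining < (UINT32_C(1) << MMU_UPDATE_PREEMPT_FDOM_SHIFT));
    assert(foreign_dom <= 0x7fff);
    return remaining |
           ((uint32_t)foreign_dom << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
           MMU_UPDATE_PREEMPTED;
}

/* Mirrors the unpacking done at the top of do_mmu_update. */
static void decode_count(uint32_t count, uint32_t *remaining, uint16_t *foreign_dom)
{
    count        &= ~MMU_UPDATE_PREEMPTED;
    *foreign_dom  = (uint16_t)(count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT);
    *remaining    = count & ~MMU_UPDATE_PREEMPT_FDOM_MASK;
}

int main(void)
{
    uint32_t remaining;
    uint16_t dom;

    decode_count(encode_count(42, 7), &remaining, &dom);
    printf("remaining=%u foreign_dom=%u\n", (unsigned)remaining, (unsigned)dom);
    return 0;
}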