
annotate xen/arch/x86/memory.c @ 3645:fd1dd0663b09

bitkeeper revision 1.1159.212.68 (42001e4d1AQiGV2pdPTNrs2AU2LjsQ)

Merge pb001.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into pb001.cl.cam.ac.uk:/auto/groups/xeno/users/iap10/xeno-clone/xen-unstable.bk
author iap10@pb001.cl.cam.ac.uk
date Wed Feb 02 00:26:53 2005 +0000 (2005-02-02)
parents fec8b1778268 e6af5d8f8b39
children 060c1ea52343
rev   line source
djm@1749 1 /******************************************************************************
djm@1749 2 * arch/x86/memory.c
djm@1749 3 *
djm@1749 4 * Copyright (c) 2002-2004 K A Fraser
cl349@2093 5 * Copyright (c) 2004 Christian Limpach
djm@1749 6 *
djm@1749 7 * This program is free software; you can redistribute it and/or modify
djm@1749 8 * it under the terms of the GNU General Public License as published by
djm@1749 9 * the Free Software Foundation; either version 2 of the License, or
djm@1749 10 * (at your option) any later version.
djm@1749 11 *
djm@1749 12 * This program is distributed in the hope that it will be useful,
djm@1749 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
djm@1749 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
djm@1749 15 * GNU General Public License for more details.
djm@1749 16 *
djm@1749 17 * You should have received a copy of the GNU General Public License
djm@1749 18 * along with this program; if not, write to the Free Software
djm@1749 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
djm@1749 20 */
djm@1749 21
djm@1749 22 /*
djm@1749 23 * A description of the x86 page table API:
djm@1749 24 *
djm@1749 25 * Domains trap to do_mmu_update with a list of update requests.
djm@1749 26 * This is a list of (ptr, val) pairs, where the requested operation
djm@1749 27 * is *ptr = val.
djm@1749 28 *
djm@1749 29 * Reference counting of pages:
djm@1749 30 * ----------------------------
djm@1749 31 * Each page has two refcounts: tot_count and type_count.
djm@1749 32 *
djm@1749 33 * TOT_COUNT is the obvious reference count. It counts all uses of a
djm@1749 34 * physical page frame by a domain, including uses as a page directory,
djm@1749 35 * a page table, or simple mappings via a PTE. This count prevents a
djm@1749 36 * domain from releasing a frame back to the free pool when it still holds
djm@1749 37 * a reference to it.
djm@1749 38 *
djm@1749 39 * TYPE_COUNT is more subtle. A frame can be put to one of three
djm@1749 40 * mutually-exclusive uses: it might be used as a page directory, or a
kaf24@2375 41 * page table, or it may be mapped writable by the domain [of course, a
djm@1749 42 * frame may be used in none of these three ways!].
djm@1749 43 * So, type_count is a count of the number of times a frame is being
djm@1749 44 * referred to in its current incarnation. Therefore, a page can only
djm@1749 45 * change its type when its type count is zero.
djm@1749 46 *
djm@1749 47 * Pinning the page type:
djm@1749 48 * ----------------------
djm@1749 49 * The type of a page can be pinned/unpinned with the commands
djm@1749 50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
djm@1749 51 * pinning is not reference counted, so it can't be nested).
djm@1749 52 * This is useful to prevent a page's type count falling to zero, at which
djm@1749 53 * point safety checks would need to be carried out next time the count
djm@1749 54 * is increased again.
djm@1749 55 *
kaf24@2375 56 * A further note on writable page mappings:
kaf24@2375 57 * -----------------------------------------
kaf24@2375 58 * For simplicity, the count of writable mappings for a page may not
kaf24@2375 59 * correspond to reality. The 'writable count' is incremented for every
djm@1749 60 * PTE which maps the page with the _PAGE_RW flag set. However, for
djm@1749 61 * write access to be possible the page directory entry must also have
djm@1749 62 * its _PAGE_RW bit set. We do not check this as it complicates the
djm@1749 63 * reference counting considerably [consider the case of multiple
djm@1749 64 * directory entries referencing a single page table, some with the RW
djm@1749 65 * bit set, others not -- it starts getting a bit messy].
djm@1749 66 * In normal use, this simplification shouldn't be a problem.
djm@1749 67 * However, the logic can be added if required.
djm@1749 68 *
djm@1749 69 * One more note on read-only page mappings:
djm@1749 70 * -----------------------------------------
djm@1749 71 * We want domains to be able to map pages for read-only access. The
djm@1749 72 * main reason is that page tables and directories should be readable
kaf24@2375 73 * by a domain, but it would not be safe for them to be writable.
djm@1749 74 * However, domains have free access to rings 1 & 2 of the Intel
djm@1749 75 * privilege model. In terms of page protection, these are considered
djm@1749 76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
djm@1749 77 * read-only restrictions are respected in supervisor mode -- if the
kaf24@2375 78 * bit is clear then any mapped page is writable.
djm@1749 79 *
djm@1749 80 * We get round this by always setting the WP bit and disallowing
djm@1749 81 * updates to it. This is very unlikely to cause a problem for guest
djm@1749 82 * OS's, which will generally use the WP bit to simplify copy-on-write
djm@1749 83 * implementation (in that case, the OS wants a fault when it writes to
djm@1749 84 * an application-supplied buffer).
djm@1749 85 */
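/*
 * A minimal guest-side sketch of the interface described above, illustrative
 * only and not compiled as part of this file. It assumes the guest's usual
 * three-argument HYPERVISOR_mmu_update() wrapper (request array, count,
 * success-count pointer, matching do_mmu_update() below) and the public
 * MMU_* / MMUEXT_* definitions; 'pte_ma', 'new_val' and 'l2_mfn' are
 * hypothetical values the guest already knows.
 */
#if 0
static int example_update_pte_and_pin(unsigned long pte_ma,
                                      unsigned long new_val,
                                      unsigned long l2_mfn)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* Normal update: Xen checks 'new_val' and then performs "*ptr = val". */
    req[0].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_val;

    /* Extended command: pin the L2 table so its type count cannot fall to 0. */
    req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    return HYPERVISOR_mmu_update(req, 2, &done);
}
#endif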
djm@1749 86
djm@1749 87 #include <xen/config.h>
djm@1749 88 #include <xen/init.h>
kaf24@3392 89 #include <xen/kernel.h>
djm@1749 90 #include <xen/lib.h>
djm@1749 91 #include <xen/mm.h>
djm@1749 92 #include <xen/sched.h>
djm@1749 93 #include <xen/errno.h>
djm@1749 94 #include <xen/perfc.h>
djm@1749 95 #include <xen/irq.h>
iap10@2479 96 #include <xen/softirq.h>
kaf24@1787 97 #include <asm/shadow.h>
djm@1749 98 #include <asm/page.h>
djm@1749 99 #include <asm/flushtlb.h>
djm@1749 100 #include <asm/io.h>
djm@1749 101 #include <asm/uaccess.h>
djm@1749 102 #include <asm/domain_page.h>
djm@1749 103 #include <asm/ldt.h>
djm@1749 104
kaf24@2097 105 #ifdef VERBOSE
djm@1749 106 #define MEM_LOG(_f, _a...) \
djm@1749 107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
cl349@2957 108 current->domain->id , __LINE__ , ## _a )
djm@1749 109 #else
djm@1749 110 #define MEM_LOG(_f, _a...) ((void)0)
djm@1749 111 #endif
djm@1749 112
djm@1749 113 static int alloc_l2_table(struct pfn_info *page);
djm@1749 114 static int alloc_l1_table(struct pfn_info *page);
djm@1749 115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
djm@1749 116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 117 u32 type,
djm@1749 118 struct domain *d);
djm@1749 119
djm@1749 120 static void free_l2_table(struct pfn_info *page);
djm@1749 121 static void free_l1_table(struct pfn_info *page);
djm@1749 122
djm@1749 123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
djm@1749 124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
djm@1749 125
djm@1749 126 /* Used to defer flushing of memory structures. */
djm@1749 127 static struct {
djm@1749 128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
djm@1749 129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
kaf24@3187 130 unsigned long deferred_ops;
kaf24@2314 131 /* If non-NULL, specifies a foreign subject domain for some operations. */
kaf24@3187 132 struct domain *foreign;
kaf24@3113 133 } __cacheline_aligned percpu_info[NR_CPUS];
djm@1749 134
kaf24@2314 135 /*
kaf24@2314 136 * Returns the current foreign domain; defaults to the currently-executing
kaf24@2314 137 * domain if a foreign override hasn't been specified.
kaf24@2314 138 */
cl349@2957 139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
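/*
 * A sketch (illustrative only, not compiled here) of how a privileged tool
 * domain establishes such a foreign override from guest context. The override
 * must travel in the same batch as the operations it covers, because
 * do_mmu_update() drops percpu_info[].foreign on exit. The usual
 * HYPERVISOR_mmu_update() wrapper is assumed; 'new_dom', 'mfn' (machine
 * frame) and 'pfn' (pseudo-physical frame) are hypothetical caller-supplied
 * values.
 */
#if 0
static int example_machphys_update_for(domid_t new_dom,
                                       unsigned long mfn, unsigned long pfn)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* Account the remainder of this batch to 'new_dom' rather than to us. */
    req[0].ptr = MMU_EXTENDED_COMMAND;
    req[0].val = MMUEXT_SET_FOREIGNDOM | ((unsigned long)new_dom << 16);

    /* Set the machine-to-physical entry for one of new_dom's frames. */
    req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
    req[1].val = pfn;

    return HYPERVISOR_mmu_update(req, 2, &done);
}
#endif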
djm@1749 140
kaf24@2336 141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
kaf24@2336 142 static struct domain *dom_xen, *dom_io;
cl349@2227 143
kaf24@3392 144 /* Frame table and its size in pages. */
kaf24@3392 145 struct pfn_info *frame_table;
kaf24@3392 146 unsigned long frame_table_size;
kaf24@3392 147 unsigned long max_page;
kaf24@3392 148
kaf24@3392 149 void __init init_frametable(void)
kaf24@3392 150 {
kaf24@3392 151 unsigned long i, p;
kaf24@3392 152
kaf24@3632 153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
kaf24@3392 154 frame_table_size = max_page * sizeof(struct pfn_info);
kaf24@3392 155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
kaf24@3392 156
kaf24@3392 157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
kaf24@3392 158 {
kaf24@3392 159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
kaf24@3392 160 if ( p == 0 )
kaf24@3392 161 panic("Not enough memory for frame table\n");
kaf24@3632 162 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
kaf24@3632 163 4UL << 20, PAGE_HYPERVISOR);
kaf24@3392 164 }
kaf24@3392 165
kaf24@3392 166 memset(frame_table, 0, frame_table_size);
kaf24@3392 167 }
kaf24@3392 168
cl349@2227 169 void arch_init_memory(void)
djm@1749 170 {
kaf24@3640 171 #ifdef __i386__
sos22@3478 172 unsigned long i;
kaf24@2336 173
kaf24@2384 174 /*
kaf24@2384 175 * We are rather picky about the layout of 'struct pfn_info'. The
kaf24@2384 176 * count_info and domain fields must be adjacent, as we perform atomic
kaf24@2384 177 * 64-bit operations on them. Also, just for sanity, we assert the size
kaf24@2384 178 * of the structure here.
kaf24@2384 179 */
kaf24@2384 180 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
kaf24@2384 181 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
kaf24@2384 182 (sizeof(struct pfn_info) != 24) )
kaf24@2384 183 {
kaf24@2384 184 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
kaf24@2384 185 offsetof(struct pfn_info, count_info),
kaf24@2384 186 offsetof(struct pfn_info, u.inuse.domain),
kaf24@2384 187 sizeof(struct pfn_info));
kaf24@2384 188 for ( ; ; ) ;
kaf24@2384 189 }
kaf24@2384 190
djm@1749 191 memset(percpu_info, 0, sizeof(percpu_info));
cl349@2227 192
kaf24@2336 193 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
kaf24@2336 194 memset(machine_to_phys_mapping, 0x55, 4<<20);
kaf24@2336 195
kaf24@2336 196 /*
kaf24@2336 197 * Initialise our DOMID_XEN domain.
kaf24@2336 198 * Any Xen-heap pages that we will allow to be mapped will have
kaf24@2336 199 * their domain field set to dom_xen.
kaf24@2336 200 */
kaf24@2336 201 dom_xen = alloc_domain_struct();
kaf24@2336 202 atomic_set(&dom_xen->refcnt, 1);
kaf24@2748 203 dom_xen->id = DOMID_XEN;
kaf24@2336 204
kaf24@2336 205 /*
kaf24@2336 206 * Initialise our DOMID_IO domain.
kaf24@2336 207 * This domain owns no pages but is considered a special case when
kaf24@2336 208 * mapping I/O pages, as the mappings are made with the privileges of the caller.
kaf24@2336 209 */
kaf24@2336 210 dom_io = alloc_domain_struct();
kaf24@2336 211 atomic_set(&dom_io->refcnt, 1);
kaf24@2748 212 dom_io->id = DOMID_IO;
kaf24@2336 213
kaf24@2336 214 /* M2P table is mappable read-only by privileged domains. */
kaf24@3392 215 for ( i = 0; i < 1024; i++ )
kaf24@2336 216 {
sos22@3478 217 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
sos22@3478 218 /* Use the GDT-page type so that non-privileged domains can only
sos22@3478 219 map these frames read-only. */
sos22@3478 220 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
sos22@3478 221 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
kaf24@2336 222 }
kaf24@3640 223 #endif
djm@1749 224 }
djm@1749 225
cl349@2957 226 static void __invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 227 {
djm@1749 228 int i;
djm@1749 229 unsigned long pfn;
djm@1749 230 struct pfn_info *page;
djm@1749 231
djm@1749 232 d->mm.shadow_ldt_mapcnt = 0;
djm@1749 233
djm@1749 234 for ( i = 16; i < 32; i++ )
djm@1749 235 {
cl349@3036 236 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
djm@1749 237 if ( pfn == 0 ) continue;
cl349@3036 238 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
djm@1749 239 page = &frame_table[pfn];
djm@1749 240 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
cl349@3036 241 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
djm@1749 242 put_page_and_type(page);
djm@1749 243 }
djm@1749 244
djm@1749 245 /* Dispose of the (now possibly invalid) mappings from the TLB. */
djm@1749 246 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
djm@1749 247 }
djm@1749 248
djm@1749 249
cl349@2957 250 static inline void invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 251 {
djm@1749 252 if ( d->mm.shadow_ldt_mapcnt != 0 )
djm@1749 253 __invalidate_shadow_ldt(d);
djm@1749 254 }
djm@1749 255
djm@1749 256
kaf24@2336 257 static int alloc_segdesc_page(struct pfn_info *page)
djm@1749 258 {
djm@1749 259 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
djm@1749 260 int i;
djm@1749 261
djm@1749 262 for ( i = 0; i < 512; i++ )
kaf24@1854 263 if ( unlikely(!check_descriptor(&descs[i*2])) )
djm@1749 264 goto fail;
djm@1749 265
djm@1749 266 unmap_domain_mem(descs);
djm@1749 267 return 1;
djm@1749 268
djm@1749 269 fail:
djm@1749 270 unmap_domain_mem(descs);
djm@1749 271 return 0;
djm@1749 272 }
djm@1749 273
djm@1749 274
djm@1749 275 /* Map shadow page at offset @off. */
djm@1749 276 int map_ldt_shadow_page(unsigned int off)
djm@1749 277 {
cl349@2957 278 struct exec_domain *ed = current;
cl349@2957 279 struct domain *d = ed->domain;
djm@1749 280 unsigned long l1e;
djm@1749 281
djm@1749 282 if ( unlikely(in_irq()) )
djm@1749 283 BUG();
djm@1749 284
cl349@2957 285 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
djm@1749 286 PAGE_SHIFT) + off]);
djm@1749 287
djm@1749 288 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
djm@1749 289 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
djm@1749 290 d, PGT_ldt_page)) )
djm@1749 291 return 0;
djm@1749 292
cl349@3036 293 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
cl349@2957 294 ed->mm.shadow_ldt_mapcnt++;
djm@1749 295
djm@1749 296 return 1;
djm@1749 297 }
djm@1749 298
djm@1749 299
djm@1749 300 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
djm@1749 301 {
djm@1749 302 struct pfn_info *page = &frame_table[page_nr];
djm@1749 303
djm@1749 304 if ( unlikely(!pfn_is_ram(page_nr)) )
djm@1749 305 {
djm@1749 306 MEM_LOG("Pfn %08lx is not RAM", page_nr);
djm@1749 307 return 0;
djm@1749 308 }
djm@1749 309
djm@1749 310 if ( unlikely(!get_page(page, d)) )
djm@1749 311 {
djm@1749 312 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
djm@1749 313 return 0;
djm@1749 314 }
djm@1749 315
djm@1749 316 return 1;
djm@1749 317 }
djm@1749 318
djm@1749 319
djm@1749 320 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 321 u32 type,
djm@1749 322 struct domain *d)
djm@1749 323 {
djm@1749 324 struct pfn_info *page = &frame_table[page_nr];
djm@1749 325
djm@1749 326 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
djm@1749 327 return 0;
djm@1749 328
djm@1749 329 if ( unlikely(!get_page_type(page, type)) )
djm@1749 330 {
cl349@2450 331 #ifdef VERBOSE
cl349@2491 332 if ( (type & PGT_type_mask) != PGT_l1_page_table )
cl349@2491 333 MEM_LOG("Bad page type for pfn %08lx (%08x)",
cl349@2491 334 page_nr, page->u.inuse.type_info);
cl349@2450 335 #endif
djm@1749 336 put_page(page);
djm@1749 337 return 0;
djm@1749 338 }
djm@1749 339
djm@1749 340 return 1;
djm@1749 341 }
djm@1749 342
djm@1749 343
djm@1749 344 /*
djm@1749 345 * We allow L2 tables to map each other (a.k.a. linear page tables). This
djm@1749 346 * needs some special care with reference counts and access permissions:
djm@1749 347 * 1. The mapping entry must be read-only, or the guest may get write access
djm@1749 348 * to its own PTEs.
djm@1749 349 * 2. We must only bump the reference counts for an *already validated*
djm@1749 350 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
djm@1749 351 * on a validation that is required to complete that validation.
djm@1749 352 * 3. We only need to increment the reference counts for the mapped page
djm@1749 353 * frame if it is mapped by a different L2 table. This is sufficient and
djm@1749 354 * also necessary to allow validation of an L2 table mapping itself.
djm@1749 355 */
kaf24@2314 356 static int
kaf24@2314 357 get_linear_pagetable(
kaf24@2314 358 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
djm@1749 359 {
djm@1749 360 u32 x, y;
djm@1749 361 struct pfn_info *page;
djm@1749 362
djm@1749 363 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
djm@1749 364 {
djm@1749 365 MEM_LOG("Attempt to create linear p.t. with write perms");
djm@1749 366 return 0;
djm@1749 367 }
djm@1749 368
djm@1749 369 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
djm@1749 370 {
djm@1749 371 /* Make sure the mapped frame belongs to the correct domain. */
kaf24@2314 372 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
djm@1749 373 return 0;
djm@1749 374
djm@1749 375 /*
djm@1749 376 * Make sure that the mapped frame is an already-validated L2 table.
djm@1749 377 * If so, atomically increment the count (checking for overflow).
djm@1749 378 */
djm@1749 379 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
kaf24@1970 380 y = page->u.inuse.type_info;
djm@1749 381 do {
djm@1749 382 x = y;
djm@1749 383 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
djm@1749 384 unlikely((x & (PGT_type_mask|PGT_validated)) !=
djm@1749 385 (PGT_l2_page_table|PGT_validated)) )
djm@1749 386 {
djm@1749 387 put_page(page);
djm@1749 388 return 0;
djm@1749 389 }
djm@1749 390 }
kaf24@1970 391 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
djm@1749 392 }
djm@1749 393
djm@1749 394 return 1;
djm@1749 395 }
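/*
 * A guest-eye sketch (illustrative only, not compiled here) of rules 1 and 3
 * above: installing a read-only self-mapping of an already-validated L2
 * table. 'l2_mfn' and 'slot' are hypothetical guest-chosen values, with
 * 'slot' below DOMAIN_ENTRIES_PER_L2_PAGETABLE; the usual
 * HYPERVISOR_mmu_update() wrapper is assumed.
 */
#if 0
static int example_install_linear_pt(unsigned long l2_mfn, unsigned int slot)
{
    mmu_update_t req;
    unsigned int done = 0;

    req.ptr = ((l2_mfn << PAGE_SHIFT) + slot * sizeof(l2_pgentry_t)) |
              MMU_NORMAL_PT_UPDATE;
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   /* no _PAGE_RW */

    return HYPERVISOR_mmu_update(&req, 1, &done);
}
#endif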
djm@1749 396
djm@1749 397
kaf24@2314 398 static int
kaf24@2314 399 get_page_from_l1e(
kaf24@2314 400 l1_pgentry_t l1e, struct domain *d)
djm@1749 401 {
djm@1749 402 unsigned long l1v = l1_pgentry_val(l1e);
djm@1749 403 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2382 404 struct pfn_info *page = &frame_table[pfn];
djm@1749 405 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
djm@1749 406
djm@1749 407 if ( !(l1v & _PAGE_PRESENT) )
djm@1749 408 return 1;
djm@1749 409
djm@1749 410 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
djm@1749 411 {
djm@1749 412 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
djm@1749 413 return 0;
djm@1749 414 }
djm@1749 415
djm@1749 416 if ( unlikely(!pfn_is_ram(pfn)) )
djm@1749 417 {
kaf24@2336 418 /* Revert to caller privileges if FD == DOMID_IO. */
kaf24@2336 419 if ( d == dom_io )
cl349@2957 420 d = current->domain;
kaf24@2336 421
kaf24@2336 422 if ( IS_PRIV(d) )
djm@1749 423 return 1;
djm@1749 424
kaf24@2336 425 if ( IS_CAPABLE_PHYSDEV(d) )
kaf24@2336 426 return domain_iomem_in_pfn(d, pfn);
djm@1749 427
djm@1749 428 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
djm@1749 429 return 0;
djm@1749 430 }
djm@1749 431
kaf24@2756 432 return ((l1v & _PAGE_RW) ?
kaf24@2756 433 get_page_and_type(page, d, PGT_writable_page) :
kaf24@2757 434 get_page(page, d));
djm@1749 435 }
djm@1749 436
djm@1749 437
djm@1749 438 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
kaf24@2314 439 static int
kaf24@2314 440 get_page_from_l2e(
kaf24@2466 441 l2_pgentry_t l2e, unsigned long pfn,
kaf24@2466 442 struct domain *d, unsigned long va_idx)
djm@1749 443 {
iap10@2458 444 int rc;
iap10@2458 445
djm@1749 446 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
djm@1749 447 return 1;
djm@1749 448
djm@1749 449 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
djm@1749 450 {
djm@1749 451 MEM_LOG("Bad L2 page type settings %04lx",
djm@1749 452 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
djm@1749 453 return 0;
djm@1749 454 }
djm@1749 455
iap10@2458 456 rc = get_page_and_type_from_pagenr(
iap10@2458 457 l2_pgentry_to_pagenr(l2e),
kaf24@2466 458 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
iap10@2458 459
iap10@2458 460 if ( unlikely(!rc) )
kaf24@2314 461 return get_linear_pagetable(l2e, pfn, d);
djm@1749 462
djm@1749 463 return 1;
djm@1749 464 }
djm@1749 465
djm@1749 466
kaf24@2382 467 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
djm@1749 468 {
djm@1749 469 unsigned long l1v = l1_pgentry_val(l1e);
kaf24@2385 470 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2385 471 struct pfn_info *page = &frame_table[pfn];
iap10@3424 472 struct domain *e;
djm@1749 473
kaf24@2385 474 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
djm@1749 475 return;
djm@1749 476
iap10@3424 477 e = page->u.inuse.domain;
kaf24@2382 478 if ( unlikely(e != d) )
kaf24@2382 479 {
kaf24@2382 480 /*
kaf24@2382 481 * Unmap a foreign page that may have been mapped via a grant table.
kaf24@2382 482 * Note that this can fail for a privileged domain that can map foreign
kaf24@2382 483 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
kaf24@2382 484 * counted via a grant entry and some counted directly in the page
kaf24@2382 485 * structure's reference count. Note that reference counts won't get
kaf24@2382 486 * dangerously confused as long as we always try to decrement the
kaf24@2382 487 * grant entry first. We may end up with a mismatch between which
kaf24@2382 488 * mappings and which unmappings are counted via the grant entry, but
kaf24@2382 489 * really it doesn't matter as privileged domains have carte blanche.
kaf24@2382 490 */
kaf24@2655 491 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
kaf24@2382 492 return;
kaf24@2382 493 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
kaf24@2382 494 }
kaf24@2382 495
djm@1749 496 if ( l1v & _PAGE_RW )
djm@1749 497 {
djm@1749 498 put_page_and_type(page);
djm@1749 499 }
djm@1749 500 else
djm@1749 501 {
djm@1749 502 /* We expect this is rare so we blow the entire shadow LDT. */
kaf24@1970 503 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
djm@1749 504 PGT_ldt_page)) &&
kaf24@1970 505 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
cl349@2957 506 invalidate_shadow_ldt(e->exec_domain[0]);
djm@1749 507 put_page(page);
djm@1749 508 }
djm@1749 509 }
djm@1749 510
djm@1749 511
djm@1749 512 /*
djm@1749 513 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
djm@1749 514 * Note also that this automatically deals correctly with linear p.t.'s.
djm@1749 515 */
djm@1749 516 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
djm@1749 517 {
djm@1749 518 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
djm@1749 519 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
djm@1749 520 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
djm@1749 521 }
djm@1749 522
djm@1749 523
djm@1749 524 static int alloc_l2_table(struct pfn_info *page)
djm@1749 525 {
kaf24@2314 526 struct domain *d = page->u.inuse.domain;
kaf24@2314 527 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 528 l2_pgentry_t *pl2e;
kaf24@2314 529 int i;
djm@1749 530
djm@1749 531 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 532
kaf24@3392 533 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
iap10@2458 534 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
djm@1749 535 goto fail;
kaf24@3392 536
djm@1749 537 #if defined(__i386__)
djm@1749 538 /* Now we add our private high mappings. */
djm@1749 539 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 540 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 541 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
djm@1749 542 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
djm@1749 543 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
djm@1749 544 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
cl349@3036 545 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
djm@1749 546 __PAGE_HYPERVISOR);
djm@1749 547 #endif
djm@1749 548
djm@1749 549 unmap_domain_mem(pl2e);
djm@1749 550 return 1;
djm@1749 551
djm@1749 552 fail:
djm@1749 553 while ( i-- > 0 )
djm@1749 554 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 555
djm@1749 556 unmap_domain_mem(pl2e);
djm@1749 557 return 0;
djm@1749 558 }
djm@1749 559
djm@1749 560
djm@1749 561 static int alloc_l1_table(struct pfn_info *page)
djm@1749 562 {
kaf24@2314 563 struct domain *d = page->u.inuse.domain;
kaf24@2314 564 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 565 l1_pgentry_t *pl1e;
kaf24@2314 566 int i;
djm@1749 567
djm@1749 568 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 569
djm@1749 570 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2314 571 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
djm@1749 572 goto fail;
djm@1749 573
djm@1749 574 unmap_domain_mem(pl1e);
djm@1749 575 return 1;
djm@1749 576
djm@1749 577 fail:
djm@1749 578 while ( i-- > 0 )
kaf24@2382 579 put_page_from_l1e(pl1e[i], d);
djm@1749 580
djm@1749 581 unmap_domain_mem(pl1e);
djm@1749 582 return 0;
djm@1749 583 }
djm@1749 584
djm@1749 585
djm@1749 586 static void free_l2_table(struct pfn_info *page)
djm@1749 587 {
djm@1749 588 unsigned long page_nr = page - frame_table;
djm@1749 589 l2_pgentry_t *pl2e;
djm@1749 590 int i;
djm@1749 591
djm@1749 592 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 593
djm@1749 594 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
djm@1749 595 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 596
djm@1749 597 unmap_domain_mem(pl2e);
djm@1749 598 }
djm@1749 599
djm@1749 600
djm@1749 601 static void free_l1_table(struct pfn_info *page)
djm@1749 602 {
kaf24@2382 603 struct domain *d = page->u.inuse.domain;
djm@1749 604 unsigned long page_nr = page - frame_table;
djm@1749 605 l1_pgentry_t *pl1e;
djm@1749 606 int i;
djm@1749 607
djm@1749 608 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 609
djm@1749 610 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2382 611 put_page_from_l1e(pl1e[i], d);
djm@1749 612
djm@1749 613 unmap_domain_mem(pl1e);
djm@1749 614 }
djm@1749 615
djm@1749 616
djm@1749 617 static inline int update_l2e(l2_pgentry_t *pl2e,
djm@1749 618 l2_pgentry_t ol2e,
djm@1749 619 l2_pgentry_t nl2e)
djm@1749 620 {
djm@1749 621 unsigned long o = cmpxchg((unsigned long *)pl2e,
djm@1749 622 l2_pgentry_val(ol2e),
djm@1749 623 l2_pgentry_val(nl2e));
djm@1749 624 if ( o != l2_pgentry_val(ol2e) )
djm@1749 625 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 626 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
djm@1749 627 return (o == l2_pgentry_val(ol2e));
djm@1749 628 }
djm@1749 629
djm@1749 630
djm@1749 631 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
djm@1749 632 static int mod_l2_entry(l2_pgentry_t *pl2e,
djm@1749 633 l2_pgentry_t nl2e,
djm@1749 634 unsigned long pfn)
djm@1749 635 {
djm@1749 636 l2_pgentry_t ol2e;
djm@1749 637 unsigned long _ol2e;
djm@1749 638
djm@1749 639 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
djm@1749 640 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
djm@1749 641 {
djm@1749 642 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
djm@1749 643 return 0;
djm@1749 644 }
djm@1749 645
djm@1749 646 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
djm@1749 647 return 0;
djm@1749 648 ol2e = mk_l2_pgentry(_ol2e);
djm@1749 649
djm@1749 650 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
djm@1749 651 {
djm@1749 652 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
djm@1749 653 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
djm@1749 654 return update_l2e(pl2e, ol2e, nl2e);
djm@1749 655
cl349@2957 656 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
cl349@2491 657 ((unsigned long)pl2e &
kaf24@2466 658 ~PAGE_MASK) >> 2)) )
djm@1749 659 return 0;
cl349@1860 660
djm@1749 661 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 662 {
djm@1749 663 put_page_from_l2e(nl2e, pfn);
djm@1749 664 return 0;
djm@1749 665 }
djm@1749 666
djm@1749 667 put_page_from_l2e(ol2e, pfn);
djm@1749 668 return 1;
djm@1749 669 }
djm@1749 670
djm@1749 671 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 672 return 0;
djm@1749 673
djm@1749 674 put_page_from_l2e(ol2e, pfn);
djm@1749 675 return 1;
djm@1749 676 }
djm@1749 677
djm@1749 678
djm@1749 679 static inline int update_l1e(l1_pgentry_t *pl1e,
djm@1749 680 l1_pgentry_t ol1e,
djm@1749 681 l1_pgentry_t nl1e)
djm@1749 682 {
djm@1749 683 unsigned long o = l1_pgentry_val(ol1e);
djm@1749 684 unsigned long n = l1_pgentry_val(nl1e);
djm@1749 685
djm@1749 686 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
djm@1749 687 unlikely(o != l1_pgentry_val(ol1e)) )
djm@1749 688 {
djm@1749 689 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 690 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
djm@1749 691 return 0;
djm@1749 692 }
djm@1749 693
djm@1749 694 return 1;
djm@1749 695 }
djm@1749 696
djm@1749 697
djm@1749 698 /* Update the L1 entry at pl1e to new value nl1e. */
djm@1749 699 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
djm@1749 700 {
djm@1749 701 l1_pgentry_t ol1e;
djm@1749 702 unsigned long _ol1e;
cl349@2957 703 struct domain *d = current->domain;
djm@1749 704
djm@1749 705 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
djm@1749 706 {
djm@1749 707 MEM_LOG("Bad get_user\n");
djm@1749 708 return 0;
djm@1749 709 }
djm@1749 710
djm@1749 711 ol1e = mk_l1_pgentry(_ol1e);
djm@1749 712
djm@1749 713 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
djm@1749 714 {
djm@1749 715 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
djm@1749 716 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
djm@1749 717 return update_l1e(pl1e, ol1e, nl1e);
djm@1749 718
kaf24@2314 719 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
djm@1749 720 return 0;
djm@1749 721
djm@1749 722 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 723 {
kaf24@2382 724 put_page_from_l1e(nl1e, d);
djm@1749 725 return 0;
djm@1749 726 }
djm@1749 727
kaf24@2382 728 put_page_from_l1e(ol1e, d);
djm@1749 729 return 1;
djm@1749 730 }
djm@1749 731
djm@1749 732 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 733 return 0;
djm@1749 734
kaf24@2382 735 put_page_from_l1e(ol1e, d);
djm@1749 736 return 1;
djm@1749 737 }
djm@1749 738
djm@1749 739
djm@1749 740 int alloc_page_type(struct pfn_info *page, unsigned int type)
djm@1749 741 {
djm@1749 742 switch ( type )
djm@1749 743 {
djm@1749 744 case PGT_l1_page_table:
djm@1749 745 return alloc_l1_table(page);
djm@1749 746 case PGT_l2_page_table:
djm@1749 747 return alloc_l2_table(page);
djm@1749 748 case PGT_gdt_page:
djm@1749 749 case PGT_ldt_page:
djm@1749 750 return alloc_segdesc_page(page);
djm@1749 751 default:
cl349@2491 752 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
cl349@2491 753 type, page->u.inuse.type_info,
cl349@2491 754 page->count_info);
djm@1749 755 BUG();
djm@1749 756 }
djm@1749 757
djm@1749 758 return 0;
djm@1749 759 }
djm@1749 760
djm@1749 761
djm@1749 762 void free_page_type(struct pfn_info *page, unsigned int type)
djm@1749 763 {
kaf24@2314 764 struct domain *d = page->u.inuse.domain;
kaf24@2314 765
djm@1749 766 switch ( type )
djm@1749 767 {
djm@1749 768 case PGT_l1_page_table:
djm@1749 769 free_l1_table(page);
djm@1749 770 break;
djm@1749 771
djm@1749 772 case PGT_l2_page_table:
djm@1749 773 free_l2_table(page);
djm@1749 774 break;
djm@1749 775
djm@1749 776 default:
djm@1749 777 BUG();
djm@1749 778 }
kaf24@2314 779
cl349@2957 780 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
cl349@2957 781 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
kaf24@2314 782 {
kaf24@2314 783 unshadow_table(page_to_pfn(page), type);
cl349@2957 784 put_shadow_status(&d->exec_domain[0]->mm);
kaf24@2314 785 }
djm@1749 786 }
djm@1749 787
djm@1749 788
kaf24@2498 789 void put_page_type(struct pfn_info *page)
kaf24@2498 790 {
kaf24@2498 791 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 792
kaf24@2498 793 again:
kaf24@2498 794 do {
kaf24@2498 795 x = y;
kaf24@2498 796 nx = x - 1;
kaf24@2498 797
kaf24@2498 798 ASSERT((x & PGT_count_mask) != 0);
kaf24@2588 799
kaf24@2588 800 /*
kaf24@2588 801 * The page should always be validated while a reference is held. The
kaf24@2588 802 * exception is during domain destruction, when we forcibly invalidate
kaf24@2588 803 * page-table pages if we detect a referential loop.
kaf24@2588 804 * See domain.c:relinquish_list().
kaf24@2588 805 */
kaf24@2588 806 ASSERT((x & PGT_validated) ||
cl349@3036 807 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
kaf24@2498 808
kaf24@2498 809 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 810 {
kaf24@2498 811 /* Record TLB information for flush later. Races are harmless. */
kaf24@2790 812 page->tlbflush_timestamp = tlbflush_current_time();
kaf24@2498 813
kaf24@2588 814 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
kaf24@2588 815 likely(nx & PGT_validated) )
kaf24@2498 816 {
kaf24@2498 817 /*
kaf24@2498 818 * Page-table pages must be unvalidated when count is zero. The
kaf24@2498 819 * 'free' is safe because the refcnt is non-zero and validated
kaf24@2498 820 * bit is clear => other ops will spin or fail.
kaf24@2498 821 */
kaf24@2498 822 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
kaf24@2498 823 x & ~PGT_validated)) != x) )
kaf24@2498 824 goto again;
kaf24@2498 825 /* We cleared the 'valid bit' so we do the cleanup. */
kaf24@2498 826 free_page_type(page, x & PGT_type_mask);
kaf24@2498 827 /* Carry on, but with the 'valid bit' now clear. */
kaf24@2498 828 x &= ~PGT_validated;
kaf24@2498 829 nx &= ~PGT_validated;
kaf24@2498 830 }
kaf24@2498 831 }
cl349@2644 832 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
kaf24@2498 833 (PGT_pinned | 1)) )
cl349@2644 834 {
kaf24@2498 835 /* Page is now only pinned. Make the back pointer mutable again. */
cl349@2644 836 nx |= PGT_va_mutable;
cl349@2644 837 }
kaf24@2498 838 }
kaf24@2498 839 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 840 }
kaf24@2498 841
kaf24@2498 842
kaf24@2498 843 int get_page_type(struct pfn_info *page, u32 type)
kaf24@2498 844 {
kaf24@2498 845 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 846
kaf24@2498 847 again:
kaf24@2498 848 do {
kaf24@2498 849 x = y;
kaf24@2498 850 nx = x + 1;
kaf24@2498 851 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 852 {
kaf24@2498 853 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
kaf24@2498 854 return 0;
kaf24@2498 855 }
kaf24@2498 856 else if ( unlikely((x & PGT_count_mask) == 0) )
kaf24@2498 857 {
kaf24@2498 858 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
kaf24@2498 859 {
kaf24@2498 860 /*
kaf24@2498 861 * On a type change we check whether to flush stale TLB entries. This
kaf24@2498 862 * may be unnecessary (e.g., page was GDT/LDT) but those
kaf24@2498 863 * circumstances should be very rare.
kaf24@2498 864 */
kaf24@2498 865 struct domain *d = page->u.inuse.domain;
cl349@2957 866 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
kaf24@2498 867 page->tlbflush_timestamp)) )
kaf24@2498 868 {
kaf24@2498 869 perfc_incr(need_flush_tlb_flush);
cl349@2957 870 flush_tlb_cpu(d->exec_domain[0]->processor);
kaf24@2498 871 }
kaf24@2498 872
kaf24@2498 873 /* We lose existing type, back pointer, and validity. */
kaf24@2498 874 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
kaf24@2498 875 nx |= type;
kaf24@2498 876
kaf24@2498 877 /* No special validation needed for writable pages. */
kaf24@2498 878 /* Page tables and GDT/LDT need to be scanned for validity. */
kaf24@2498 879 if ( type == PGT_writable_page )
kaf24@2498 880 nx |= PGT_validated;
kaf24@2498 881 }
kaf24@2498 882 }
kaf24@2498 883 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
kaf24@2498 884 {
kaf24@2498 885 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
kaf24@2498 886 {
kaf24@2498 887 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
kaf24@2498 888 ((type & PGT_type_mask) != PGT_l1_page_table) )
kaf24@2498 889 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
kaf24@2498 890 x & PGT_type_mask, type, page_to_pfn(page));
kaf24@2498 891 return 0;
kaf24@2498 892 }
kaf24@2498 893 else if ( (x & PGT_va_mask) == PGT_va_mutable )
kaf24@2498 894 {
kaf24@2498 895 /* The va backpointer is mutable, hence we update it. */
kaf24@2498 896 nx &= ~PGT_va_mask;
kaf24@2498 897 nx |= type; /* we know the actual type is correct */
kaf24@2498 898 }
kaf24@2498 899 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
kaf24@2498 900 {
kaf24@2506 901 /* This table is potentially mapped at multiple locations. */
kaf24@2506 902 nx &= ~PGT_va_mask;
kaf24@2506 903 nx |= PGT_va_unknown;
kaf24@2498 904 }
kaf24@2498 905 }
cl349@2644 906 else if ( unlikely(!(x & PGT_validated)) )
kaf24@2498 907 {
kaf24@2498 908 /* Someone else is updating validation of this page. Wait... */
kaf24@2498 909 while ( (y = page->u.inuse.type_info) == x )
kaf24@2498 910 {
kaf24@2498 911 rep_nop();
kaf24@2498 912 barrier();
kaf24@2498 913 }
kaf24@2498 914 goto again;
kaf24@2498 915 }
kaf24@2498 916 }
kaf24@2498 917 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 918
kaf24@2498 919 if ( unlikely(!(nx & PGT_validated)) )
kaf24@2498 920 {
kaf24@2498 921 /* Try to validate page type; drop the new reference on failure. */
kaf24@2498 922 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
kaf24@2498 923 {
kaf24@2498 924 MEM_LOG("Error while validating pfn %08lx for type %08x."
kaf24@2498 925 " caf=%08x taf=%08x\n",
kaf24@2498 926 page_to_pfn(page), type,
cl349@2644 927 page->count_info,
cl349@2644 928 page->u.inuse.type_info);
kaf24@2498 929 /* No one else can get a reference. We hold the only ref. */
kaf24@2498 930 page->u.inuse.type_info = 0;
kaf24@2498 931 return 0;
kaf24@2498 932 }
kaf24@2498 933
kaf24@2498 934 /* No one else is updating simultaneously. */
kaf24@2498 935 __set_bit(_PGT_validated, &page->u.inuse.type_info);
kaf24@2498 936 }
kaf24@2498 937
kaf24@2498 938 return 1;
kaf24@2498 939 }
kaf24@2498 940
kaf24@2498 941
kaf24@3443 942 int new_guest_cr3(unsigned long pfn)
kaf24@3443 943 {
kaf24@3443 944 struct exec_domain *ed = current;
kaf24@3443 945 struct domain *d = ed->domain;
kaf24@3443 946 int okay, cpu = smp_processor_id();
kaf24@3443 947 unsigned long old_base_pfn;
kaf24@3443 948
kaf24@3443 949 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
kaf24@3443 950 if ( likely(okay) )
kaf24@3443 951 {
kaf24@3443 952 invalidate_shadow_ldt(ed);
kaf24@3443 953
kaf24@3443 954 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
kaf24@3443 955 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
kaf24@3443 956 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
kaf24@3443 957
kaf24@3443 958 shadow_mk_pagetable(&ed->mm);
kaf24@3443 959
kaf24@3443 960 write_ptbase(&ed->mm);
kaf24@3443 961
kaf24@3443 962 put_page_and_type(&frame_table[old_base_pfn]);
kaf24@3443 963 }
kaf24@3443 964 else
kaf24@3443 965 {
kaf24@3517 966 MEM_LOG("Error while installing new baseptr %08lx", pfn);
kaf24@3443 967 }
kaf24@3443 968
kaf24@3443 969 return okay;
kaf24@3443 970 }
kaf24@3443 971
djm@1749 972 static int do_extended_command(unsigned long ptr, unsigned long val)
djm@1749 973 {
djm@1749 974 int okay = 1, cpu = smp_processor_id();
djm@1749 975 unsigned int cmd = val & MMUEXT_CMD_MASK;
djm@1749 976 unsigned long pfn = ptr >> PAGE_SHIFT;
djm@1749 977 struct pfn_info *page = &frame_table[pfn];
cl349@2957 978 struct exec_domain *ed = current;
cl349@2957 979 struct domain *d = ed->domain, *nd, *e;
djm@1749 980 u32 x, y;
djm@1749 981 domid_t domid;
kaf24@2385 982 grant_ref_t gntref;
djm@1749 983
djm@1749 984 switch ( cmd )
djm@1749 985 {
kaf24@2465 986 case MMUEXT_PIN_L1_TABLE:
kaf24@2465 987 case MMUEXT_PIN_L2_TABLE:
kaf24@2466 988 /*
kaf24@2466 989 * We insist that, if you pin an L1 page, it's the first thing that
kaf24@2466 990 * you do to it. This is because we require the backptr to still be
kaf24@2466 991 * mutable. This assumption seems safe.
kaf24@2466 992 */
djm@1749 993 okay = get_page_and_type_from_pagenr(
kaf24@2465 994 pfn,
kaf24@2465 995 ((cmd==MMUEXT_PIN_L2_TABLE) ?
cl349@2491 996 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
kaf24@2465 997 FOREIGNDOM);
iap10@2458 998
djm@1749 999 if ( unlikely(!okay) )
djm@1749 1000 {
djm@1749 1001 MEM_LOG("Error while pinning pfn %08lx", pfn);
djm@1749 1002 break;
djm@1749 1003 }
djm@1749 1004
kaf24@2466 1005 if ( unlikely(test_and_set_bit(_PGT_pinned,
kaf24@2466 1006 &page->u.inuse.type_info)) )
djm@1749 1007 {
djm@1749 1008 MEM_LOG("Pfn %08lx already pinned", pfn);
djm@1749 1009 put_page_and_type(page);
djm@1749 1010 okay = 0;
djm@1749 1011 break;
djm@1749 1012 }
djm@1749 1013
djm@1749 1014 break;
djm@1749 1015
djm@1749 1016 case MMUEXT_UNPIN_TABLE:
kaf24@2314 1017 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
djm@1749 1018 {
djm@1749 1019 MEM_LOG("Page %08lx bad domain (dom=%p)",
kaf24@1970 1020 ptr, page->u.inuse.domain);
djm@1749 1021 }
kaf24@2466 1022 else if ( likely(test_and_clear_bit(_PGT_pinned,
kaf24@2466 1023 &page->u.inuse.type_info)) )
djm@1749 1024 {
djm@1749 1025 put_page_and_type(page);
djm@1749 1026 put_page(page);
djm@1749 1027 }
djm@1749 1028 else
djm@1749 1029 {
djm@1749 1030 okay = 0;
djm@1749 1031 put_page(page);
djm@1749 1032 MEM_LOG("Pfn %08lx not pinned", pfn);
djm@1749 1033 }
djm@1749 1034 break;
djm@1749 1035
djm@1749 1036 case MMUEXT_NEW_BASEPTR:
kaf24@3443 1037 okay = new_guest_cr3(pfn);
djm@1749 1038 break;
djm@1749 1039
djm@1749 1040 case MMUEXT_TLB_FLUSH:
djm@1749 1041 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
djm@1749 1042 break;
djm@1749 1043
djm@1749 1044 case MMUEXT_INVLPG:
djm@1749 1045 __flush_tlb_one(ptr);
djm@1749 1046 break;
djm@1749 1047
kaf24@2463 1048 case MMUEXT_FLUSH_CACHE:
kaf24@2463 1049 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
kaf24@2463 1050 {
kaf24@2463 1051 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
kaf24@2463 1052 okay = 0;
kaf24@2463 1053 }
kaf24@2463 1054 else
kaf24@2463 1055 {
kaf24@2463 1056 wbinvd();
kaf24@2463 1057 }
kaf24@2463 1058 break;
kaf24@2463 1059
djm@1749 1060 case MMUEXT_SET_LDT:
djm@1749 1061 {
djm@1749 1062 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
djm@1749 1063 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
djm@1749 1064 (ents > 8192) ||
djm@1749 1065 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
djm@1749 1066 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
djm@1749 1067 {
djm@1749 1068 okay = 0;
djm@1749 1069 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
djm@1749 1070 }
cl349@2957 1071 else if ( (ed->mm.ldt_ents != ents) ||
cl349@2957 1072 (ed->mm.ldt_base != ptr) )
djm@1749 1073 {
cl349@2957 1074 invalidate_shadow_ldt(ed);
cl349@2957 1075 ed->mm.ldt_base = ptr;
cl349@2957 1076 ed->mm.ldt_ents = ents;
cl349@2957 1077 load_LDT(ed);
djm@1749 1078 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
djm@1749 1079 if ( ents != 0 )
djm@1749 1080 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
djm@1749 1081 }
djm@1749 1082 break;
djm@1749 1083 }
djm@1749 1084
kaf24@2314 1085 case MMUEXT_SET_FOREIGNDOM:
kaf24@2314 1086 domid = (domid_t)(val >> 16);
djm@1749 1087
kaf24@2362 1088 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2362 1089 put_domain(e);
kaf24@2362 1090 percpu_info[cpu].foreign = NULL;
kaf24@2362 1091
djm@1749 1092 if ( !IS_PRIV(d) )
djm@1749 1093 {
kaf24@2336 1094 switch ( domid )
kaf24@2336 1095 {
kaf24@2336 1096 case DOMID_IO:
kaf24@2362 1097 get_knownalive_domain(dom_io);
kaf24@2362 1098 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1099 break;
kaf24@2336 1100 default:
kaf24@2748 1101 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
kaf24@2336 1102 okay = 0;
kaf24@2336 1103 break;
kaf24@2336 1104 }
djm@1749 1105 }
djm@1749 1106 else
djm@1749 1107 {
kaf24@2314 1108 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
kaf24@2314 1109 if ( e == NULL )
djm@1749 1110 {
kaf24@2336 1111 switch ( domid )
kaf24@2336 1112 {
kaf24@2336 1113 case DOMID_XEN:
kaf24@2362 1114 get_knownalive_domain(dom_xen);
kaf24@2362 1115 percpu_info[cpu].foreign = dom_xen;
kaf24@2336 1116 break;
kaf24@2336 1117 case DOMID_IO:
kaf24@2362 1118 get_knownalive_domain(dom_io);
kaf24@2362 1119 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1120 break;
kaf24@2336 1121 default:
kaf24@2336 1122 MEM_LOG("Unknown domain '%u'", domid);
kaf24@2336 1123 okay = 0;
kaf24@2336 1124 break;
kaf24@2336 1125 }
djm@1749 1126 }
djm@1749 1127 }
djm@1749 1128 break;
djm@1749 1129
kaf24@2385 1130 case MMUEXT_TRANSFER_PAGE:
kaf24@2385 1131 domid = (domid_t)(val >> 16);
kaf24@2385 1132 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
kaf24@2385 1133
kaf24@2385 1134 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
kaf24@2385 1135 unlikely(!pfn_is_ram(pfn)) ||
kaf24@2385 1136 unlikely((e = find_domain_by_id(domid)) == NULL) )
kaf24@2385 1137 {
kaf24@2385 1138 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
kaf24@2385 1139 okay = 0;
kaf24@2385 1140 break;
kaf24@2385 1141 }
kaf24@2385 1142
kaf24@2385 1143 spin_lock(&d->page_alloc_lock);
kaf24@2385 1144
kaf24@2385 1145 /*
kaf24@2385 1146 * The tricky bit: atomically release ownership while there is just one
kaf24@2385 1147 * benign reference to the page (PGC_allocated). If that reference
kaf24@2385 1148 * disappears then the deallocation routine will safely spin.
kaf24@2385 1149 */
kaf24@2385 1150 nd = page->u.inuse.domain;
kaf24@2385 1151 y = page->count_info;
kaf24@2385 1152 do {
kaf24@2385 1153 x = y;
kaf24@2385 1154 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
kaf24@2385 1155 (1|PGC_allocated)) ||
kaf24@2385 1156 unlikely(nd != d) )
kaf24@2385 1157 {
kaf24@2385 1158 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
kaf24@2385 1159 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1160 d, d->id, nd, x, page->u.inuse.type_info);
kaf24@2385 1161 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1162 put_domain(e);
kaf24@2663 1163 return 0;
kaf24@2385 1164 }
kaf24@2385 1165 __asm__ __volatile__(
kaf24@2385 1166 LOCK_PREFIX "cmpxchg8b %2"
kaf24@2385 1167 : "=d" (nd), "=a" (y),
kaf24@2385 1168 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2385 1169 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
kaf24@2385 1170 }
kaf24@2385 1171 while ( unlikely(nd != d) || unlikely(y != x) );
kaf24@2385 1172
kaf24@2385 1173 /*
kaf24@2385 1174 * Unlink from 'd'. At least one reference remains (now anonymous), so
kaf24@2385 1175 * no one else is spinning to try to delete this page from 'd'.
kaf24@2385 1176 */
kaf24@2385 1177 d->tot_pages--;
kaf24@2385 1178 list_del(&page->list);
kaf24@2385 1179
kaf24@2385 1180 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1181
kaf24@2385 1182 spin_lock(&e->page_alloc_lock);
kaf24@2385 1183
kaf24@2466 1184 /*
kaf24@2466 1185 * Check that 'e' will accept the page and has reservation headroom.
kaf24@2466 1186 * Also, a domain mustn't have PGC_allocated pages when it is dying.
kaf24@2466 1187 */
kaf24@2385 1188 ASSERT(e->tot_pages <= e->max_pages);
cl349@2957 1189 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
kaf24@2466 1190 unlikely(e->tot_pages == e->max_pages) ||
kaf24@2385 1191 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
kaf24@2385 1192 {
kaf24@2431 1193 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
kaf24@2469 1194 "provided a bad grant ref, or is dying (%08lx).\n",
cl349@2957 1195 e->tot_pages, e->max_pages, e->d_flags);
kaf24@2385 1196 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1197 put_domain(e);
kaf24@2385 1198 okay = 0;
kaf24@2385 1199 break;
kaf24@2385 1200 }
kaf24@2385 1201
kaf24@2385 1202 /* Okay, add the page to 'e'. */
kaf24@2385 1203 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2385 1204 get_knownalive_domain(e);
kaf24@2385 1205 list_add_tail(&page->list, &e->page_list);
kaf24@2385 1206 page->u.inuse.domain = e;
kaf24@2385 1207
kaf24@2385 1208 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1209
kaf24@2385 1210 /* Transfer is all done: tell the guest about its new page frame. */
kaf24@2385 1211 gnttab_notify_transfer(e, gntref, pfn);
kaf24@2385 1212
kaf24@2385 1213 put_domain(e);
kaf24@2385 1214 break;
kaf24@2385 1215
djm@1749 1216 case MMUEXT_REASSIGN_PAGE:
djm@1749 1217 if ( unlikely(!IS_PRIV(d)) )
djm@1749 1218 {
kaf24@2748 1219 MEM_LOG("Dom %u has no reassignment priv", d->id);
djm@1749 1220 okay = 0;
djm@1749 1221 break;
djm@1749 1222 }
djm@1749 1223
kaf24@2314 1224 e = percpu_info[cpu].foreign;
kaf24@2314 1225 if ( unlikely(e == NULL) )
djm@1749 1226 {
kaf24@2314 1227 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
djm@1749 1228 okay = 0;
djm@1749 1229 break;
djm@1749 1230 }
djm@1749 1231
djm@1749 1232 /*
djm@1749 1233 * Grab both page_list locks, in order. This prevents the page from
djm@1749 1234 * disappearing elsewhere while we modify the owner, and we'll need
djm@1749 1235 * both locks if we're successful so that we can change lists.
djm@1749 1236 */
djm@1749 1237 if ( d < e )
djm@1749 1238 {
djm@1749 1239 spin_lock(&d->page_alloc_lock);
djm@1749 1240 spin_lock(&e->page_alloc_lock);
djm@1749 1241 }
djm@1749 1242 else
djm@1749 1243 {
djm@1749 1244 spin_lock(&e->page_alloc_lock);
djm@1749 1245 spin_lock(&d->page_alloc_lock);
djm@1749 1246 }
djm@1749 1247
djm@1749 1248 /* A domain shouldn't have PGC_allocated pages when it is dying. */
cl349@2957 1249 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
djm@1749 1250 unlikely(IS_XEN_HEAP_FRAME(page)) )
djm@1749 1251 {
kaf24@1871 1252 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
djm@1749 1253 okay = 0;
djm@1749 1254 goto reassign_fail;
djm@1749 1255 }
djm@1749 1256
djm@1749 1257 /*
djm@1749 1258 * The tricky bit: atomically change owner while there is just one
djm@1749 1259 * benign reference to the page (PGC_allocated). If that reference
djm@1749 1260 * disappears then the deallocation routine will safely spin.
djm@1749 1261 */
kaf24@1970 1262 nd = page->u.inuse.domain;
kaf24@2384 1263 y = page->count_info;
djm@1749 1264 do {
djm@1749 1265 x = y;
djm@1749 1266 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
djm@1749 1267 (1|PGC_allocated)) ||
djm@1749 1268 unlikely(nd != d) )
djm@1749 1269 {
djm@1749 1270 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
djm@1749 1271 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1272 d, d->id, nd, x, page->u.inuse.type_info);
djm@1749 1273 okay = 0;
djm@1749 1274 goto reassign_fail;
djm@1749 1275 }
djm@1749 1276 __asm__ __volatile__(
djm@1749 1277 LOCK_PREFIX "cmpxchg8b %3"
kaf24@2384 1278 : "=d" (nd), "=a" (y), "=c" (e),
kaf24@2384 1279 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2384 1280 : "0" (d), "1" (x), "c" (e), "b" (x) );
djm@1749 1281 }
djm@1749 1282 while ( unlikely(nd != d) || unlikely(y != x) );
djm@1749 1283
djm@1749 1284 /*
djm@1749 1285 * Unlink from 'd'. We transferred at least one reference to 'e', so
djm@1749 1286 * noone else is spinning to try to delete this page from 'd'.
djm@1749 1287 */
djm@1749 1288 d->tot_pages--;
djm@1749 1289 list_del(&page->list);
djm@1749 1290
djm@1749 1291 /*
djm@1749 1292 * Add the page to 'e'. Someone may already have removed the last
djm@1749 1293 * reference and want to remove the page from 'e'. However, we have
djm@1749 1294 * the lock so they'll spin waiting for us.
djm@1749 1295 */
djm@1749 1296 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2336 1297 get_knownalive_domain(e);
djm@1749 1298 list_add_tail(&page->list, &e->page_list);
djm@1749 1299
djm@1749 1300 reassign_fail:
djm@1749 1301 spin_unlock(&d->page_alloc_lock);
djm@1749 1302 spin_unlock(&e->page_alloc_lock);
djm@1749 1303 break;
djm@1749 1304
kaf24@2314 1305 case MMUEXT_CLEAR_FOREIGNDOM:
kaf24@2314 1306 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2314 1307 put_domain(e);
kaf24@2314 1308 percpu_info[cpu].foreign = NULL;
djm@1749 1309 break;
djm@1749 1310
djm@1749 1311 default:
djm@1749 1312 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
djm@1749 1313 okay = 0;
djm@1749 1314 break;
djm@1749 1315 }
djm@1749 1316
djm@1749 1317 return okay;
djm@1749 1318 }
djm@1749 1319
kaf24@3177 1320 int do_mmu_update(
kaf24@3177 1321 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
kaf24@3177 1322 {
kaf24@3177 1323 /*
kaf24@3177 1324 * We steal the m.s.b. of the @count parameter to indicate whether this
kaf24@3177 1325 * invocation of do_mmu_update() is resuming a previously preempted call.
kaf24@3187 1326 * We steal the next 15 bits to remember the current FOREIGNDOM.
kaf24@3177 1327 */
kaf24@3187 1328 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
kaf24@3187 1329 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
kaf24@3187 1330 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
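    /*
     * Worked example of this encoding (illustrative): a call with count == 512
     * that is preempted after i == 384 updates while FOREIGNDOM has id 7 is
     * continued with
     *   count' = (512 - 384)
     *          | (7 << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
     *          | MMU_UPDATE_PREEMPTED
     * so that the resumed call can report work already done via @pdone and
     * re-establish the foreign subject before handling the remaining 128
     * requests (see the hypercall_create_continuation() call below).
     */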
djm@1749 1331
djm@1749 1332 mmu_update_t req;
djm@1749 1333 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
djm@1749 1334 struct pfn_info *page;
kaf24@3187 1335 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
kaf24@3177 1336 unsigned int cmd, done = 0;
djm@1749 1337 unsigned long prev_spfn = 0;
djm@1749 1338 l1_pgentry_t *prev_spl1e = 0;
cl349@2957 1339 struct exec_domain *ed = current;
cl349@2957 1340 struct domain *d = ed->domain;
kaf24@2466 1341 u32 type_info;
kaf24@3187 1342 domid_t domid;
djm@1749 1343
cl349@3036 1344 LOCK_BIGLOCK(d);
cl349@3036 1345
kaf24@3517 1346 cleanup_writable_pagetable(d);
kaf24@2375 1347
kaf24@3177 1348 /*
kaf24@3177 1349 * If we are resuming after preemption, read how much work we have already
kaf24@3177 1350 * done. This allows us to set the @done output parameter correctly.
kaf24@3187 1351 * We also reset FOREIGNDOM here.
kaf24@3177 1352 */
kaf24@3187 1353 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
kaf24@3177 1354 {
kaf24@3187 1355 if ( !(count & MMU_UPDATE_PREEMPTED) )
kaf24@3187 1356 {
kaf24@3187 1357 /* Count overflow into private FOREIGNDOM field. */
kaf24@3187 1358 MEM_LOG("do_mmu_update count is too large");
kaf24@3187 1359 rc = -EINVAL;
kaf24@3187 1360 goto out;
kaf24@3187 1361 }
kaf24@3177 1362 count &= ~MMU_UPDATE_PREEMPTED;
kaf24@3187 1363 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
kaf24@3187 1364 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
kaf24@3177 1365 if ( unlikely(pdone != NULL) )
kaf24@3177 1366 (void)get_user(done, pdone);
cl349@3193 1367 if ( (domid != current->domain->id) &&
kaf24@3187 1368 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
kaf24@3187 1369 {
kaf24@3187 1370 rc = -EINVAL;
kaf24@3187 1371 goto out;
kaf24@3187 1372 }
kaf24@3177 1373 }
kaf24@3177 1374
kaf24@3269 1375 perfc_incrc(calls_to_mmu_update);
kaf24@3269 1376 perfc_addc(num_page_updates, count);
kaf24@3269 1377
kaf24@3177 1378 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
kaf24@3187 1379 {
kaf24@3187 1380 rc = -EFAULT;
kaf24@3187 1381 goto out;
kaf24@3187 1382 }
cl349@1860 1383
djm@1749 1384 for ( i = 0; i < count; i++ )
djm@1749 1385 {
kaf24@3177 1386 if ( hypercall_preempt_check() )
kaf24@3177 1387 {
kaf24@3187 1388 rc = hypercall_create_continuation(
kaf24@3177 1389 __HYPERVISOR_mmu_update, 3, ureqs,
kaf24@3187 1390 (count - i) |
kaf24@3187 1391 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
kaf24@3187 1392 MMU_UPDATE_PREEMPTED, pdone);
kaf24@3177 1393 break;
kaf24@3177 1394 }
kaf24@3129 1395
kaf24@2375 1396 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
djm@1749 1397 {
kaf24@2375 1398 MEM_LOG("Bad __copy_from_user");
djm@1749 1399 rc = -EFAULT;
djm@1749 1400 break;
djm@1749 1401 }
djm@1749 1402
djm@1749 1403 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
djm@1749 1404 pfn = req.ptr >> PAGE_SHIFT;
djm@1749 1405
djm@1749 1406 okay = 0;
djm@1749 1407
djm@1749 1408 switch ( cmd )
djm@1749 1409 {
djm@1749 1410 /*
djm@1749 1411 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
djm@1749 1412 */
djm@1749 1413 case MMU_NORMAL_PT_UPDATE:
cl349@2957 1414 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
djm@1749 1415 {
djm@1749 1416 MEM_LOG("Could not get page for normal update");
djm@1749 1417 break;
djm@1749 1418 }
djm@1749 1419
djm@1749 1420 if ( likely(prev_pfn == pfn) )
djm@1749 1421 {
djm@1749 1422 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
djm@1749 1423 }
djm@1749 1424 else
djm@1749 1425 {
djm@1749 1426 if ( prev_pfn != 0 )
djm@1749 1427 unmap_domain_mem((void *)va);
djm@1749 1428 va = (unsigned long)map_domain_mem(req.ptr);
djm@1749 1429 prev_pfn = pfn;
djm@1749 1430 }
djm@1749 1431
djm@1749 1432 page = &frame_table[pfn];
kaf24@2466 1433 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
djm@1749 1434 {
djm@1749 1435 case PGT_l1_page_table:
kaf24@2466 1436 if ( likely(get_page_type(
kaf24@2466 1437 page, type_info & (PGT_type_mask|PGT_va_mask))) )
djm@1749 1438 {
djm@1749 1439 okay = mod_l1_entry((l1_pgentry_t *)va,
djm@1749 1440 mk_l1_pgentry(req.val));
djm@1749 1441
cl349@2957 1442 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1443 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1444 PSH_shadowed) )
djm@1749 1445 {
kaf24@2375 1446 shadow_l1_normal_pt_update(
kaf24@2375 1447 req.ptr, req.val, &prev_spfn, &prev_spl1e);
cl349@2957 1448 put_shadow_status(&ed->mm);
djm@1749 1449 }
djm@1749 1450
djm@1749 1451 put_page_type(page);
djm@1749 1452 }
djm@1749 1453 break;
djm@1749 1454 case PGT_l2_page_table:
djm@1749 1455 if ( likely(get_page_type(page, PGT_l2_page_table)) )
djm@1749 1456 {
djm@1749 1457 okay = mod_l2_entry((l2_pgentry_t *)va,
djm@1749 1458 mk_l2_pgentry(req.val),
djm@1749 1459 pfn);
djm@1749 1460
cl349@2957 1461 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1462 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1463 PSH_shadowed) )
djm@1749 1464 {
kaf24@2375 1465 shadow_l2_normal_pt_update(req.ptr, req.val);
cl349@2957 1466 put_shadow_status(&ed->mm);
djm@1749 1467 }
djm@1749 1468
djm@1749 1469 put_page_type(page);
djm@1749 1470 }
djm@1749 1471 break;
djm@1749 1472 default:
kaf24@2375 1473 if ( likely(get_page_type(page, PGT_writable_page)) )
djm@1749 1474 {
djm@1749 1475 *(unsigned long *)va = req.val;
djm@1749 1476 okay = 1;
djm@1749 1477 put_page_type(page);
djm@1749 1478 }
djm@1749 1479 break;
djm@1749 1480 }
djm@1749 1481
djm@1749 1482 put_page(page);
djm@1749 1483 break;
djm@1749 1484
djm@1749 1485 case MMU_MACHPHYS_UPDATE:
kaf24@2314 1486 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
djm@1749 1487 {
djm@1749 1488 MEM_LOG("Could not get page for mach->phys update");
djm@1749 1489 break;
djm@1749 1490 }
djm@1749 1491
djm@1749 1492 machine_to_phys_mapping[pfn] = req.val;
djm@1749 1493 okay = 1;
djm@1749 1494
djm@1749 1495 /*
djm@1749 1496 * If in log-dirty mode, mark the corresponding pseudo-physical
djm@1749 1497 * page as dirty.
djm@1749 1498 */
cl349@2957 1499 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
cl349@2957 1500 mark_dirty(&ed->mm, pfn) )
cl349@2957 1501 ed->mm.shadow_dirty_block_count++;
djm@1749 1502
djm@1749 1503 put_page(&frame_table[pfn]);
djm@1749 1504 break;
djm@1749 1505
djm@1749 1506 /*
djm@1749 1507 * MMU_EXTENDED_COMMAND: Extended command is specified
djm@1749 1508 * in the least-significant bits of the 'value' field.
djm@1749 1509 */
djm@1749 1510 case MMU_EXTENDED_COMMAND:
djm@1749 1511 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
djm@1749 1512 okay = do_extended_command(req.ptr, req.val);
djm@1749 1513 break;
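/*
 * Editorial sketch of how a request selecting this case is laid out
 * (hedged; the authoritative encoding is in the public interface headers).
 * The command selector lives in the low bits of 'ptr', the MMUEXT_* opcode
 * in the low bits of 'val', e.g. to pin a frame as an L1 page table:
 *
 *     req.ptr = (mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
 *     req.val = MMUEXT_PIN_L1_TABLE;
 *
 * ('mfn' is a placeholder machine frame number.)
 */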
djm@1749 1514
djm@1749 1515 default:
djm@1749 1516 MEM_LOG("Invalid page update command %08lx", req.ptr);
djm@1749 1517 break;
djm@1749 1518 }
djm@1749 1519
djm@1749 1520 if ( unlikely(!okay) )
djm@1749 1521 {
djm@1749 1522 rc = -EINVAL;
djm@1749 1523 break;
djm@1749 1524 }
djm@1749 1525
djm@1749 1526 ureqs++;
djm@1749 1527 }
djm@1749 1528
kaf24@3187 1529 out:
djm@1749 1530 if ( prev_pfn != 0 )
djm@1749 1531 unmap_domain_mem((void *)va);
djm@1749 1532
kaf24@2375 1533 if ( unlikely(prev_spl1e != 0) )
djm@1749 1534 unmap_domain_mem((void *)prev_spl1e);
djm@1749 1535
djm@1749 1536 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1537 percpu_info[cpu].deferred_ops = 0;
djm@1749 1538
djm@1749 1539 if ( deferred_ops & DOP_FLUSH_TLB )
djm@1749 1540 local_flush_tlb();
kaf24@2375 1541
djm@1749 1542 if ( deferred_ops & DOP_RELOAD_LDT )
djm@1749 1543 (void)map_ldt_shadow_page(0);
djm@1749 1544
kaf24@2314 1545 if ( unlikely(percpu_info[cpu].foreign != NULL) )
djm@1749 1546 {
kaf24@2314 1547 put_domain(percpu_info[cpu].foreign);
kaf24@2314 1548 percpu_info[cpu].foreign = NULL;
djm@1749 1549 }
djm@1749 1550
kaf24@3177 1551 /* Add incremental work we have done to the @done output parameter. */
kaf24@3177 1552 if ( unlikely(pdone != NULL) )
kaf24@3177 1553 __put_user(done + i, pdone);
djm@1749 1554
cl349@3036 1555 UNLOCK_BIGLOCK(d);
djm@1749 1556 return rc;
djm@1749 1557 }
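/*
 * Editorial note -- a guest's-eye usage sketch for the hypercall above,
 * assuming the usual mmu_update_t layout { ptr, val } and a three-argument
 * HYPERVISOR_mmu_update() wrapper matching this handler; the variable names
 * are placeholders:
 *
 *     mmu_update_t req[2];
 *     unsigned int done = 0;
 *
 *     // Normal PT update: write new_pte into the PTE at machine address
 *     // pte_maddr (the low two bits of ptr select MMU_NORMAL_PT_UPDATE).
 *     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_pte;
 *
 *     // Machine->physical table update for machine frame mfn.
 *     req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
 *     req[1].val = pfn;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, &done) != 0 )
 *         ; // some request failed; 'done' reports how many were applied
 */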
djm@1749 1558
djm@1749 1559
djm@1749 1560 int do_update_va_mapping(unsigned long page_nr,
djm@1749 1561 unsigned long val,
djm@1749 1562 unsigned long flags)
djm@1749 1563 {
cl349@2957 1564 struct exec_domain *ed = current;
cl349@2957 1565 struct domain *d = ed->domain;
djm@1749 1566 int err = 0;
cl349@2957 1567 unsigned int cpu = ed->processor;
djm@1749 1568 unsigned long deferred_ops;
djm@1749 1569
djm@1749 1570 perfc_incrc(calls_to_update_va);
djm@1749 1571
djm@1749 1572 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
djm@1749 1573 return -EINVAL;
djm@1749 1574
cl349@3036 1575 LOCK_BIGLOCK(d);
cl349@3036 1576
kaf24@3517 1577 cleanup_writable_pagetable(d);
cl349@1879 1578
djm@1749 1579 /*
djm@1749 1580 * XXX When we make this support 4MB superpages we should also deal with
djm@1749 1581 * the case of updating L2 entries.
djm@1749 1582 */
djm@1749 1583
djm@1749 1584 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
djm@1749 1585 mk_l1_pgentry(val))) )
djm@1749 1586 err = -EINVAL;
djm@1749 1587
cl349@2957 1588 if ( unlikely(ed->mm.shadow_mode) )
djm@1749 1589 {
djm@1749 1590 unsigned long sval;
djm@1749 1591
cl349@2957 1592 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
djm@1749 1593
djm@1749 1594 if ( unlikely(__put_user(sval, ((unsigned long *)(
djm@1749 1595 &shadow_linear_pg_table[page_nr])))) )
djm@1749 1596 {
djm@1749 1597 /*
djm@1749 1598 * Since L2s are guaranteed RW, failure indicates the page was not
djm@1749 1599 * shadowed, so ignore.
djm@1749 1600 */
djm@1749 1601 perfc_incrc(shadow_update_va_fail);
djm@1749 1602 }
djm@1749 1603
djm@1749 1604 /*
djm@1749 1605 * If we're in log-dirty mode then we need to note that we've updated
djm@1749 1606 * the PTE in the PT-holding page. We need the machine frame number
djm@1749 1607 * for this.
djm@1749 1608 */
cl349@2957 1609 if ( ed->mm.shadow_mode == SHM_logdirty )
kaf24@2673 1610 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
djm@1749 1611
cl349@2957 1612 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
djm@1749 1613 }
djm@1749 1614
djm@1749 1615 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1616 percpu_info[cpu].deferred_ops = 0;
djm@1749 1617
djm@1749 1618 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
djm@1749 1619 unlikely(flags & UVMF_FLUSH_TLB) )
djm@1749 1620 local_flush_tlb();
djm@1749 1621 else if ( unlikely(flags & UVMF_INVLPG) )
djm@1749 1622 __flush_tlb_one(page_nr << PAGE_SHIFT);
djm@1749 1623
djm@1749 1624 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
djm@1749 1625 (void)map_ldt_shadow_page(0);
djm@1749 1626
cl349@3036 1627 UNLOCK_BIGLOCK(d);
cl349@3036 1628
djm@1749 1629 return err;
djm@1749 1630 }
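/*
 * Editorial note -- guest usage sketch for the hypercall above (assuming
 * the usual HYPERVISOR_update_va_mapping() wrapper; 'va' and 'new_pte' are
 * placeholders). Note that the first argument is a page number, not a
 * virtual address, and must lie below HYPERVISOR_VIRT_START:
 *
 *     if ( HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte,
 *                                       UVMF_INVLPG) != 0 )
 *         ; // new PTE failed validation (-EINVAL)
 *
 * UVMF_FLUSH_TLB requests a full local TLB flush instead of a single
 * INVLPG; passing 0 relies on any flush the hypervisor has already
 * deferred.
 */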
djm@1749 1631
djm@1749 1632 int do_update_va_mapping_otherdomain(unsigned long page_nr,
djm@1749 1633 unsigned long val,
djm@1749 1634 unsigned long flags,
djm@1749 1635 domid_t domid)
djm@1749 1636 {
djm@1749 1637 unsigned int cpu = smp_processor_id();
djm@1749 1638 struct domain *d;
djm@1749 1639 int rc;
djm@1749 1640
cl349@2957 1641 if ( unlikely(!IS_PRIV(current->domain)) )
djm@1749 1642 return -EPERM;
djm@1749 1643
kaf24@2314 1644 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
djm@1749 1645 if ( unlikely(d == NULL) )
djm@1749 1646 {
djm@1749 1647 MEM_LOG("Unknown domain '%u'", domid);
djm@1749 1648 return -ESRCH;
djm@1749 1649 }
djm@1749 1650
djm@1749 1651 rc = do_update_va_mapping(page_nr, val, flags);
djm@1749 1652
djm@1749 1653 put_domain(d);
kaf24@2314 1654 percpu_info[cpu].foreign = NULL;
djm@1749 1655
djm@1749 1656 return rc;
djm@1749 1657 }
cl349@1879 1658
cl349@1879 1659
cl349@1921 1660
kaf24@2382 1661 /*************************
kaf24@2382 1662 * Writable Pagetables
kaf24@2382 1663 */
cl349@2093 1664
kaf24@2663 1665 ptwr_info_t ptwr_info[NR_CPUS];
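/*
 * Editorial overview of the mechanism implemented below:
 *
 *  1. A guest write to one of its (read-only) L1 page-table pages faults
 *     into ptwr_do_page_fault(). The page is unhooked from the L2 if it is
 *     currently in use (the ACTIVE slot), a snapshot of all its entries is
 *     taken, and the mapping is made temporarily writable so the guest can
 *     batch ordinary stores to it.
 *  2. Before the hypervisor next needs a consistent view -- on the next
 *     ptwr fault for that slot, or via cleanup_writable_pagetable() at the
 *     top of the other page-table operations (see do_update_va_mapping()
 *     above) -- ptwr_flush() re-protects the page, diffs it against the
 *     snapshot, and revalidates only the PTEs that changed, fixing up
 *     reference counts and any shadow-mode state.
 *
 * At most one ACTIVE and one INACTIVE page may be in this writable state
 * per CPU at any time; their bookkeeping lives in ptwr_info[] above.
 */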
cl349@1894 1666
kaf24@2097 1667 #ifdef VERBOSE
cl349@2496 1668 int ptwr_debug = 0x0;
kaf24@2654 1669 #define PTWR_PRINTK(_f, _a...) \
kaf24@2654 1670 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
cl349@2652 1671 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
cl349@2093 1672 #else
kaf24@2654 1673 #define PTWR_PRINTK(_f, _a...) ((void)0)
cl349@2093 1674 #endif
cl349@1879 1675
kaf24@2663 1676 /* Flush the given writable p.t. page and write-protect it again. */
cl349@2512 1677 void ptwr_flush(const int which)
cl349@1879 1678 {
kaf24@2663 1679 unsigned long sstat, spte, pte, *ptep, l1va;
kaf24@2663 1680 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
kaf24@3022 1681 l2_pgentry_t *pl2e;
kaf24@2663 1682 int i, cpu = smp_processor_id();
cl349@2957 1683 struct exec_domain *ed = current;
cl349@2957 1684 struct domain *d = ed->domain;
cl349@1879 1685
iap10@2640 1686 l1va = ptwr_info[cpu].ptinfo[which].l1va;
cl349@2644 1687 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
cl349@1913 1688
kaf24@2663 1689 /*
kaf24@2663 1690 * STEP 1. Write-protect the p.t. page so no more updates can occur.
kaf24@2663 1691 */
kaf24@2663 1692
kaf24@2663 1693 if ( unlikely(__get_user(pte, ptep)) )
kaf24@2663 1694 {
cl349@2512 1695 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
kaf24@2707 1696 /*
kaf24@2707 1697 * Really a bug. We could read this PTE during the initial fault,
kaf24@2841 1698 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
kaf24@2707 1699 */
kaf24@2707 1700 BUG();
cl349@2414 1701 }
kaf24@2654 1702 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
kaf24@2654 1703 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1704 pte &= ~_PAGE_RW;
iap10@2640 1705
cl349@2957 1706 if ( unlikely(ed->mm.shadow_mode) )
kaf24@2663 1707 {
kaf24@2663 1708 /* Write-protect the p.t. page in the shadow page table. */
cl349@2957 1709 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
kaf24@2663 1710 __put_user(
kaf24@2663 1711 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
kaf24@2663 1712
kaf24@2663 1713 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
cl349@2957 1714 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
kaf24@2663 1715 if ( sstat & PSH_shadowed )
kaf24@2663 1716 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
iap10@2640 1717 }
iap10@2640 1718
kaf24@2663 1719 /* Write-protect the p.t. page in the guest page table. */
kaf24@2663 1720 if ( unlikely(__put_user(pte, ptep)) )
kaf24@2663 1721 {
cl349@2512 1722 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
kaf24@2707 1723 /*
kaf24@2707 1724 * Really a bug. We could write this PTE during the initial fault,
kaf24@2841 1725 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
kaf24@2707 1726 */
kaf24@2707 1727 BUG();
cl349@2414 1728 }
kaf24@2663 1729
kaf24@2663 1730 /* Ensure that there are no stale writable mappings in any TLB. */
kaf24@2841 1731 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
cl349@3325 1732 #if 1
kaf24@2841 1733 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
cl349@3036 1734 #else
cl349@3036 1735 flush_tlb_all();
cl349@3036 1736 #endif
kaf24@2654 1737 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
kaf24@2654 1738 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1739
kaf24@2663 1740 /*
kaf24@2663 1741 * STEP 2. Validate any modified PTEs.
kaf24@2663 1742 */
kaf24@2663 1743
cl349@2631 1744 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
kaf24@2663 1745 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2663 1746 {
cl349@2631 1747 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
cl349@2631 1748 nl1e = pl1e[i];
kaf24@2663 1749
kaf24@2663 1750 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
cl349@2631 1751 continue;
kaf24@2663 1752
kaf24@2663 1753 /*
kaf24@2663 1754 * Fast path for PTEs that have merely been write-protected
kaf24@2663 1755 * (e.g., during a Unix fork()). A strict reduction in privilege.
kaf24@2663 1756 */
kaf24@2663 1757 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
kaf24@2663 1758 {
kaf24@2663 1759 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
kaf24@2663 1760 {
kaf24@2663 1761 if ( unlikely(sl1e != NULL) )
kaf24@2673 1762 l1pte_propagate_from_guest(
cl349@2957 1763 &ed->mm, &l1_pgentry_val(nl1e),
kaf24@2663 1764 &l1_pgentry_val(sl1e[i]));
kaf24@2663 1765 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
kaf24@2663 1766 }
cl349@2644 1767 continue;
kaf24@2663 1768 }
kaf24@2663 1769
kaf24@2663 1770 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
kaf24@2663 1771 {
cl349@2631 1772 MEM_LOG("ptwr: Could not re-validate l1 page\n");
kaf24@2707 1773 /*
kaf24@2707 1774 * Make the remaining p.t's consistent before crashing, so the
kaf24@2707 1775 * reference counts are correct.
kaf24@2707 1776 */
kaf24@2707 1777 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
kaf24@2707 1778 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
cl349@2708 1779 unmap_domain_mem(pl1e);
cl349@2708 1780 ptwr_info[cpu].ptinfo[which].l1va = 0;
cl349@3036 1781 UNLOCK_BIGLOCK(d);
cl349@2631 1782 domain_crash();
cl349@2631 1783 }
kaf24@2663 1784
kaf24@2663 1785 if ( unlikely(sl1e != NULL) )
kaf24@2673 1786 l1pte_propagate_from_guest(
cl349@2957 1787 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
kaf24@2663 1788
kaf24@2663 1789 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
kaf24@2663 1790 put_page_from_l1e(ol1e, d);
cl349@2631 1791 }
cl349@2631 1792 unmap_domain_mem(pl1e);
cl349@2631 1793
kaf24@2663 1794 /*
kaf24@2663 1795 * STEP 3. Reattach the L1 p.t. page into the current address space.
kaf24@2663 1796 */
kaf24@2663 1797
cl349@2957 1798 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
kaf24@2663 1799 {
kaf24@2663 1800 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
kaf24@3022 1801 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
cl349@2631 1802 }
iap10@2509 1803
kaf24@2663 1804 /*
kaf24@2663 1805 * STEP 4. Final tidy-up.
kaf24@2663 1806 */
iap10@2509 1807
cl349@2512 1808 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1809
kaf24@2663 1810 if ( unlikely(sl1e != NULL) )
kaf24@2663 1811 {
kaf24@2663 1812 unmap_domain_mem(sl1e);
cl349@2957 1813 put_shadow_status(&ed->mm);
kaf24@2663 1814 }
cl349@1879 1815 }
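/*
 * Editorial note on the revalidation loop above: the common case after a
 * guest fork()-style operation is that entries have merely lost _PAGE_RW,
 * a pure privilege reduction, so the fast path skips the full
 * get_page_from_l1e() checks; only genuinely new mappings pay for complete
 * revalidation, and a failure there crashes the domain.
 */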
cl349@1879 1816
kaf24@2663 1817 /* Write page fault handler: check if guest is trying to modify a PTE. */
cl349@1879 1818 int ptwr_do_page_fault(unsigned long addr)
cl349@1879 1819 {
kaf24@3022 1820 unsigned long pte, pfn, l2e;
cl349@1879 1821 struct pfn_info *page;
kaf24@3022 1822 l2_pgentry_t *pl2e;
kaf24@2663 1823 int which, cpu = smp_processor_id();
kaf24@2663 1824 u32 l2_idx;
iap10@2458 1825
kaf24@2663 1826 /*
kaf24@2663 1827 * Attempt to read the PTE that maps the VA being accessed. By checking for
kaf24@2663 1828 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
kaf24@2663 1829 */
kaf24@2663 1830 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
kaf24@2663 1831 _PAGE_PRESENT) ||
kaf24@2663 1832 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
cl349@3036 1833 {
kaf24@2663 1834 return 0;
cl349@3036 1835 }
iap10@2509 1836
kaf24@2663 1837 pfn = pte >> PAGE_SHIFT;
kaf24@2663 1838 page = &frame_table[pfn];
cl349@1915 1839
kaf24@2663 1840 /* We are looking only for read-only mappings of p.t. pages. */
kaf24@2663 1841 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
kaf24@2663 1842 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
cl349@3036 1843 {
kaf24@2663 1844 return 0;
cl349@3036 1845 }
kaf24@2663 1846
kaf24@2663 1847 /* Get the L2 index at which this L1 p.t. is always mapped. */
kaf24@2663 1848 l2_idx = page->u.inuse.type_info & PGT_va_mask;
kaf24@2663 1849 if ( unlikely(l2_idx >= PGT_va_unknown) )
cl349@3036 1850 {
kaf24@2663 1851 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
cl349@3036 1852 }
kaf24@2663 1853 l2_idx >>= PGT_va_shift;
kaf24@3022 1854
kaf24@3022 1855 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
kaf24@3022 1856 {
kaf24@3022 1857 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
kaf24@3022 1858 domain_crash();
kaf24@3022 1859 }
kaf24@3022 1860
kaf24@2663 1861 /*
kaf24@2663 1862 * Is the L1 p.t. mapped into the current address space? If so we call it
kaf24@2663 1863 * an ACTIVE p.t., otherwise it is INACTIVE.
kaf24@2663 1864 */
kaf24@2663 1865 pl2e = &linear_l2_table[l2_idx];
kaf24@3022 1866 l2e = l2_pgentry_val(*pl2e);
kaf24@3022 1867 which = PTWR_PT_INACTIVE;
kaf24@3022 1868 if ( (l2e >> PAGE_SHIFT) == pfn )
kaf24@3022 1869 {
cl349@3179 1870 /* Check the PRESENT bit to set ACTIVE. */
kaf24@3022 1871 if ( likely(l2e & _PAGE_PRESENT) )
kaf24@3022 1872 which = PTWR_PT_ACTIVE;
cl349@3179 1873 else {
cl349@3179 1874 /*
cl349@3179 1875 * If the PRESENT bit is clear, we may be conflicting with
cl349@3179 1876 * the current ACTIVE p.t. (it may be the same p.t. mapped
cl349@3179 1877 * at another virt addr).
cl349@3179 1878 * The ptwr_flush call below will restore the PRESENT bit.
cl349@3179 1879 */
cl349@3179 1880 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
cl349@3179 1881 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
cl349@3179 1882 which = PTWR_PT_ACTIVE;
cl349@3179 1883 }
kaf24@3022 1884 }
kaf24@2663 1885
kaf24@2663 1886 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
kaf24@2663 1887 "pfn %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1888 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
kaf24@2663 1889
kaf24@2663 1890 /*
kaf24@2663 1891 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
kaf24@2663 1892 * time. If there is already one, we must flush it out.
kaf24@2663 1893 */
kaf24@2663 1894 if ( ptwr_info[cpu].ptinfo[which].l1va )
kaf24@2663 1895 ptwr_flush(which);
iap10@2507 1896
kaf24@2663 1897 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
kaf24@2663 1898 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
kaf24@2663 1899
kaf24@2663 1900 /* For safety, disconnect the L1 p.t. page from current space. */
kaf24@2663 1901 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
kaf24@2663 1902 {
kaf24@3022 1903 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
cl349@3325 1904 #if 1
kaf24@2841 1905 flush_tlb(); /* XXX Multi-CPU guests? */
cl349@3036 1906 #else
cl349@3036 1907 flush_tlb_all();
cl349@3036 1908 #endif
cl349@1879 1909 }
kaf24@2663 1910
kaf24@2663 1911 /* Temporarily map the L1 page, and make a copy of it. */
kaf24@2663 1912 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
kaf24@2663 1913 memcpy(ptwr_info[cpu].ptinfo[which].page,
kaf24@2663 1914 ptwr_info[cpu].ptinfo[which].pl1e,
kaf24@2663 1915 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
kaf24@2663 1916
kaf24@2663 1917 /* Finally, make the p.t. page writable by the guest OS. */
kaf24@2663 1918 pte |= _PAGE_RW;
kaf24@2663 1919 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1920 &linear_pg_table[addr>>PAGE_SHIFT], pte);
kaf24@2663 1921 if ( unlikely(__put_user(pte, (unsigned long *)
kaf24@2663 1922 &linear_pg_table[addr>>PAGE_SHIFT])) )
kaf24@2663 1923 {
kaf24@2663 1924 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
kaf24@2663 1925 &linear_pg_table[addr>>PAGE_SHIFT]);
kaf24@2707 1926 /* Toss the writable pagetable state and crash. */
kaf24@2707 1927 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
kaf24@2707 1928 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1929 domain_crash();
kaf24@2663 1930 }
kaf24@2663 1931
kaf24@3090 1932 return EXCRET_fault_fixed;
cl349@1879 1933 }
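/*
 * Editorial note -- a sketch of the expected caller (normally the x86 #PF
 * handler; hedged, as that code lives elsewhere): a write fault below
 * HYPERVISOR_VIRT_START is offered to ptwr_do_page_fault() and, if it
 * returns non-zero, the faulting guest instruction is simply restarted:
 *
 *     if ( (addr < HYPERVISOR_VIRT_START) &&
 *          (error_code & 2) &&                // write access
 *          ptwr_do_page_fault(addr) )
 *         return EXCRET_fault_fixed;          // retry the store
 */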
cl349@1894 1934
kaf24@2504 1935 static __init int ptwr_init(void)
kaf24@2504 1936 {
kaf24@2504 1937 int i;
kaf24@2504 1938
kaf24@2504 1939 for ( i = 0; i < smp_num_cpus; i++ )
kaf24@2504 1940 {
cl349@2512 1941 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
cl349@2512 1942 (void *)alloc_xenheap_page();
cl349@2512 1943 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
cl349@2512 1944 (void *)alloc_xenheap_page();
kaf24@2504 1945 }
kaf24@2504 1946
kaf24@2504 1947 return 0;
kaf24@2504 1948 }
kaf24@2504 1949 __initcall(ptwr_init);
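/*
 * Editorial note: each CPU gets two snapshot pages from the Xen heap, one
 * per ptwr slot (ACTIVE and INACTIVE); ptwr_flush() diffs the live L1 page
 * against the copy kept here.
 */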
kaf24@2504 1950
kaf24@2663 1951
kaf24@2663 1952
kaf24@2663 1953
kaf24@2663 1954 /************************************************************************/
kaf24@2663 1955 /************************************************************************/
kaf24@2663 1956 /************************************************************************/
kaf24@2663 1957
cl349@2092 1958 #ifndef NDEBUG
kaf24@2663 1959
cl349@1894 1960 void ptwr_status(void)
cl349@1894 1961 {
cl349@2512 1962 unsigned long pte, *ptep, pfn;
cl349@1894 1963 struct pfn_info *page;
cl349@1894 1964 int cpu = smp_processor_id();
cl349@1894 1965
cl349@2512 1966 ptep = (unsigned long *)&linear_pg_table
cl349@2512 1967 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
kaf24@2237 1968
cl349@2512 1969 if ( __get_user(pte, ptep) ) {
cl349@2512 1970 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
cl349@2495 1971 domain_crash();
cl349@1894 1972 }
cl349@1894 1973
cl349@2495 1974 pfn = pte >> PAGE_SHIFT;
cl349@2495 1975 page = &frame_table[pfn];
cl349@2495 1976 printk("need to alloc l1 page %p\n", page);
cl349@2495 1977 /* make pt page writable */
cl349@2495 1978 printk("need to make l1 page at %p read-only: pte is %08lx\n",
cl349@2512 1979 ptep, pte);
cl349@2495 1980
cl349@2512 1981 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
cl349@1894 1982 return;
cl349@1894 1983
cl349@2512 1984 if ( __get_user(pte, (unsigned long *)
cl349@2512 1985 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
cl349@2491 1986 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
cl349@2512 1987 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
cl349@2491 1988 domain_crash();
cl349@2414 1989 }
cl349@1894 1990 pfn = pte >> PAGE_SHIFT;
cl349@1894 1991 page = &frame_table[pfn];
cl349@1894 1992 }
iap10@2479 1993
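/*
 * Editorial summary of audit_domain() below: with the target domain paused
 * (unless it is the caller's own), the routine walks d->page_list in
 * several phases --
 *
 *   PHASE 0: sanity-checks ownership and count invariants and stashes each
 *            page's original type_info in tlbflush_timestamp;
 *   PHASE 1: walks every L1/L2 table and *subtracts* the references they
 *            should account for (via the nested adjust() helper);
 *   PHASE 2: reports any page whose type count is non-zero or whose general
 *            count is not exactly 1, hunting for stray mappings with
 *            scan_for_pfn_remote();
 *   PHASE 3: re-walks the tables and adds the references back, restoring
 *            the pre-audit counts.
 *
 * Note the GCC nested functions (adjust, scan_for_pfn, ...); this is
 * debug-only code, compiled only when NDEBUG is not defined.
 */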
kaf24@2637 1994 void audit_domain(struct domain *d)
iap10@2479 1995 {
iap10@2595 1996 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
kaf24@2637 1997
kaf24@2637 1998 void adjust (struct pfn_info *page, int dir, int adjtype)
iap10@2479 1999 {
cl349@2491 2000 int count = page->count_info & PGC_count_mask;
iap10@2479 2001
cl349@2491 2002 if ( adjtype )
cl349@2491 2003 {
cl349@2491 2004 int tcount = page->u.inuse.type_info & PGT_count_mask;
cl349@2491 2005
cl349@2491 2006 ttot++;
iap10@2479 2007
cl349@2491 2008 tcount += dir;
iap10@2479 2009
cl349@2491 2010 if ( tcount < 0 )
cl349@2491 2011 {
cl349@2644 2012 /* This will only come out once. */
kaf24@2637 2013 printk("Audit %d: type count went below zero pfn=%x "
kaf24@2637 2014 "taf=%x otaf=%x\n",
kaf24@2748 2015 d->id, page-frame_table,
cl349@2491 2016 page->u.inuse.type_info,
cl349@2491 2017 page->tlbflush_timestamp);
cl349@2491 2018 }
cl349@2491 2019
cl349@2491 2020 page->u.inuse.type_info =
iap10@2573 2021 (page->u.inuse.type_info & ~PGT_count_mask) |
cl349@2644 2022 (tcount & PGT_count_mask);
cl349@2491 2023 }
iap10@2479 2024
cl349@2491 2025 ctot++;
cl349@2491 2026 count += dir;
cl349@2491 2027 if ( count < 0 )
cl349@2491 2028 {
cl349@2644 2029 /* This will only come out once. */
kaf24@2637 2030 printk("Audit %d: general count went below zero pfn=%x "
kaf24@2637 2031 "taf=%x otaf=%x\n",
kaf24@2748 2032 d->id, page-frame_table,
cl349@2491 2033 page->u.inuse.type_info,
cl349@2491 2034 page->tlbflush_timestamp);
cl349@2491 2035 }
cl349@2491 2036
cl349@2491 2037 page->count_info =
iap10@2573 2038 (page->count_info & ~PGC_count_mask) |
cl349@2644 2039 (count & PGC_count_mask);
iap10@2479 2040
iap10@2479 2041 }
iap10@2479 2042
kaf24@2637 2043 void scan_for_pfn(struct domain *d, unsigned long xpfn)
iap10@2479 2044 {
kaf24@2637 2045 unsigned long pfn, *pt;
cl349@2491 2046 struct list_head *list_ent;
kaf24@2637 2047 struct pfn_info *page;
cl349@2491 2048 int i;
iap10@2479 2049
iap10@2479 2050 list_ent = d->page_list.next;
cl349@2491 2051 for ( i = 0; (list_ent != &d->page_list); i++ )
cl349@2491 2052 {
cl349@2491 2053 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2054 page = &frame_table[pfn];
cl349@2491 2055
kaf24@2637 2056 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2057 {
kaf24@2637 2058 case PGT_l1_page_table:
kaf24@2637 2059 case PGT_l2_page_table:
kaf24@2637 2060 pt = map_domain_mem(pfn<<PAGE_SHIFT);
cl349@2491 2061 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2637 2062 if ( (pt[i] & _PAGE_PRESENT) &&
kaf24@2637 2063 ((pt[i] >> PAGE_SHIFT) == xpfn) )
kaf24@2637 2064 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
kaf24@2748 2065 d->id, i, pfn, page->u.inuse.type_info,
kaf24@2637 2066 page->count_info);
cl349@2491 2067 unmap_domain_mem(pt);
cl349@2491 2068 }
iap10@2479 2069
cl349@2491 2070 list_ent = frame_table[pfn].list.next;
cl349@2491 2071 }
iap10@2479 2072
iap10@2479 2073 }
iap10@2479 2074
kaf24@2637 2075 void scan_for_pfn_remote(unsigned long xpfn)
iap10@2479 2076 {
cl349@2491 2077 struct domain *e;
cl349@2491 2078 for_each_domain ( e )
cl349@2491 2079 scan_for_pfn( e, xpfn );
iap10@2479 2080 }
iap10@2479 2081
iap10@2479 2082 int i;
iap10@2479 2083 unsigned long pfn;
iap10@2479 2084 struct list_head *list_ent;
kaf24@2637 2085 struct pfn_info *page;
iap10@2479 2086
cl349@3036 2087 if ( d != current->domain )
cl349@2491 2088 domain_pause(d);
iap10@2479 2089 synchronise_pagetables(~0UL);
iap10@2479 2090
iap10@2479 2091 printk("pt base=%lx sh_info=%x\n",
cl349@3036 2092 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
cl349@2491 2093 virt_to_page(d->shared_info)-frame_table);
cl349@2491 2094
iap10@2479 2095 spin_lock(&d->page_alloc_lock);
iap10@2479 2096
kaf24@2637 2097 /* PHASE 0 */
iap10@2479 2098
iap10@2479 2099 list_ent = d->page_list.next;
iap10@2479 2100 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2101 {
cl349@2491 2102 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2103 page = &frame_table[pfn];
iap10@2479 2104
cl349@2491 2105 if ( page->u.inuse.domain != d )
cl349@2491 2106 BUG();
iap10@2479 2107
cl349@2491 2108 if ( (page->u.inuse.type_info & PGT_count_mask) >
cl349@2491 2109 (page->count_info & PGC_count_mask) )
cl349@2491 2110 printk("taf > caf %x %x pfn=%lx\n",
cl349@2491 2111 page->u.inuse.type_info, page->count_info, pfn );
iap10@2479 2112
kaf24@2637 2113 #if 0 /* SYSV shared memory pages plus writeable files. */
cl349@2491 2114 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
cl349@2491 2115 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2116 {
cl349@2491 2117 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2118 pfn,
cl349@2491 2119 page->u.inuse.type_info,
cl349@2491 2120 page->count_info );
cl349@2491 2121 scan_for_pfn_remote(pfn);
cl349@2491 2122 }
cl349@2092 2123 #endif
cl349@2491 2124 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
cl349@2491 2125 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2126 {
cl349@2491 2127 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2128 pfn,
cl349@2491 2129 page->u.inuse.type_info,
cl349@2491 2130 page->count_info );
cl349@2491 2131 }
iap10@2479 2132
kaf24@2637 2133 /* Use tlbflush_timestamp to store original type_info. */
cl349@2491 2134 page->tlbflush_timestamp = page->u.inuse.type_info;
iap10@2479 2135
cl349@2491 2136 list_ent = frame_table[pfn].list.next;
iap10@2479 2137 }
iap10@2479 2138
iap10@2479 2139
kaf24@2637 2140 /* PHASE 1 */
iap10@2479 2141
cl349@3036 2142 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
iap10@2479 2143
iap10@2479 2144 list_ent = d->page_list.next;
iap10@2479 2145 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2146 {
kaf24@2637 2147 unsigned long *pt;
cl349@2491 2148 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2149 page = &frame_table[pfn];
iap10@2479 2150
cl349@2491 2151 if ( page->u.inuse.domain != d )
cl349@2491 2152 BUG();
iap10@2479 2153
cl349@2491 2154 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2155 {
cl349@2491 2156 case PGT_l2_page_table:
iap10@2479 2157
cl349@2491 2158 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2159 printk("Audit %d: L2 not validated %x\n",
kaf24@2748 2160 d->id, page->u.inuse.type_info);
iap10@2479 2161
cl349@2491 2162 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2163 printk("Audit %d: L2 not pinned %x\n",
kaf24@2748 2164 d->id, page->u.inuse.type_info);
cl349@2491 2165 else
cl349@2491 2166 adjust( page, -1, 1 );
cl349@2491 2167
cl349@2491 2168 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2169
cl349@2491 2170 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2171 {
cl349@2491 2172 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2173 {
cl349@2491 2174 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2175 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2176
cl349@2491 2177 if ( l1page->u.inuse.domain != d )
cl349@2491 2178 {
kaf24@2637 2179 printk("L2: Skip bizarre page belonging to other "
kaf24@2637 2180 "dom %p\n", l1page->u.inuse.domain);
cl349@2491 2181 continue;
cl349@2491 2182 }
kaf24@2637 2183
kaf24@2637 2184 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
kaf24@2637 2185 PGT_l2_page_table )
kaf24@2637 2186 printk("Audit %d: [%x] Found %s Linear PT "
kaf24@2748 2187 "t=%x pfn=%lx\n", d->id, i,
kaf24@2637 2188 (l1pfn==pfn) ? "Self" : "Other",
kaf24@2637 2189 l1page->u.inuse.type_info,
kaf24@2637 2190 l1pfn);
kaf24@2637 2191 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
kaf24@2637 2192 PGT_l1_page_table )
kaf24@2637 2193 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
kaf24@2748 2194 d->id, i,
kaf24@2637 2195 l1page->u.inuse.type_info,
kaf24@2637 2196 l1pfn);
iap10@2479 2197
kaf24@2637 2198 adjust(l1page, -1, 1);
cl349@2491 2199 }
cl349@2491 2200 }
iap10@2479 2201
cl349@2491 2202 unmap_domain_mem(pt);
iap10@2479 2203
cl349@2491 2204 break;
iap10@2479 2205
iap10@2479 2206
cl349@2491 2207 case PGT_l1_page_table:
cl349@2491 2208
cl349@2491 2209 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2210 adjust( page, -1, 1 );
iap10@2479 2211
cl349@2491 2212 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2213 printk("Audit %d: L1 not validated %x\n",
kaf24@2748 2214 d->id, page->u.inuse.type_info);
iap10@2479 2215 #if 0
cl349@2491 2216 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2217 printk("Audit %d: L1 not pinned %x\n",
kaf24@2748 2218 d->id, page->u.inuse.type_info);
iap10@2479 2219 #endif
cl349@2491 2220 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2221
cl349@2491 2222 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2223 {
cl349@2491 2224 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2225 {
cl349@2491 2226 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2227 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2228
cl349@2644 2229 if ( l1pfn < 0x100 )
cl349@2644 2230 {
cl349@2644 2231 lowmem_mappings++;
cl349@2644 2232 continue;
cl349@2644 2233 }
iap10@2595 2234
cl349@2644 2235 if ( l1pfn > max_page )
cl349@2644 2236 {
cl349@2644 2237 io_mappings++;
cl349@2644 2238 continue;
cl349@2644 2239 }
iap10@2595 2240
cl349@2491 2241 if ( pt[i] & _PAGE_RW )
cl349@2491 2242 {
iap10@2479 2243
cl349@2491 2244 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2245 PGT_l1_page_table ||
cl349@2491 2246 (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2247 PGT_l2_page_table )
cl349@2491 2248 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
kaf24@2748 2249 d->id, i,
cl349@2491 2250 l1page->u.inuse.type_info,
cl349@2491 2251 l1pfn);
iap10@2479 2252
cl349@2491 2253 }
iap10@2479 2254
cl349@2491 2255 if ( l1page->u.inuse.domain != d )
cl349@2491 2256 {
kaf24@2637 2257 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
kaf24@2637 2258 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
kaf24@2748 2259 d->id, pfn, i,
iap10@2595 2260 (unsigned long)l1page->u.inuse.domain,
cl349@2644 2261 l1pfn,
cl349@2644 2262 l1page->count_info,
cl349@2644 2263 l1page->u.inuse.type_info,
cl349@2644 2264 machine_to_phys_mapping[l1pfn]);
cl349@2491 2265 continue;
cl349@2491 2266 }
iap10@2479 2267
kaf24@2637 2268 adjust(l1page, -1, 0);
cl349@2491 2269 }
cl349@2491 2270 }
iap10@2479 2271
cl349@2491 2272 unmap_domain_mem(pt);
iap10@2479 2273
cl349@2491 2274 break;
iap10@2595 2275 }
iap10@2479 2276
cl349@2491 2277 list_ent = frame_table[pfn].list.next;
iap10@2479 2278 }
iap10@2479 2279
kaf24@2637 2280 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
cl349@2644 2281 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
kaf24@2748 2282 d->id, lowmem_mappings, io_mappings);
iap10@2595 2283
kaf24@2637 2284 /* PHASE 2 */
iap10@2479 2285
iap10@2479 2286 ctot = ttot = 0;
iap10@2479 2287 list_ent = d->page_list.next;
iap10@2479 2288 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2289 {
cl349@2491 2290 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2291 page = &frame_table[pfn];
iap10@2479 2292
cl349@2491 2293 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2294 {
cl349@2491 2295 case PGT_l1_page_table:
cl349@2491 2296 case PGT_l2_page_table:
cl349@2491 2297 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
cl349@2491 2298 {
cl349@2491 2299 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
kaf24@2748 2300 d->id, page->u.inuse.type_info,
cl349@2491 2301 page->tlbflush_timestamp,
cl349@2491 2302 page->count_info, pfn );
cl349@2491 2303 scan_for_pfn_remote(pfn);
cl349@2491 2304 }
cl349@2491 2305 default:
cl349@2491 2306 if ( (page->count_info & PGC_count_mask) != 1 )
cl349@2491 2307 {
kaf24@2637 2308 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
kaf24@2748 2309 d->id,
cl349@2491 2310 page->count_info,
cl349@2491 2311 page->u.inuse.type_info,
cl349@2491 2312 page->tlbflush_timestamp, pfn );
cl349@2491 2313 scan_for_pfn_remote(pfn);
cl349@2491 2314 }
cl349@2491 2315 break;
cl349@2491 2316 }
iap10@2479 2317
cl349@2491 2318 list_ent = frame_table[pfn].list.next;
iap10@2479 2319 }
iap10@2479 2320
kaf24@2637 2321 /* PHASE 3 */
iap10@2479 2322
iap10@2479 2323 list_ent = d->page_list.next;
iap10@2479 2324 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2325 {
kaf24@2637 2326 unsigned long *pt;
cl349@2491 2327 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2328 page = &frame_table[pfn];
iap10@2479 2329
cl349@2491 2330 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2331 {
cl349@2491 2332 case PGT_l2_page_table:
cl349@2491 2333 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2334 adjust( page, 1, 1 );
iap10@2479 2335
cl349@2491 2336 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2337
cl349@2491 2338 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2339 {
cl349@2491 2340 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2341 {
cl349@2491 2342 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2343 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2344
cl349@2491 2345 if ( l1page->u.inuse.domain == d)
kaf24@2637 2346 adjust(l1page, 1, 1);
cl349@2491 2347 }
cl349@2491 2348 }
iap10@2479 2349
cl349@2491 2350 unmap_domain_mem(pt);
cl349@2491 2351 break;
iap10@2479 2352
cl349@2491 2353 case PGT_l1_page_table:
cl349@2491 2354 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2355 adjust( page, 1, 1 );
iap10@2479 2356
cl349@2491 2357 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2358
cl349@2491 2359 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2360 {
cl349@2491 2361 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2362 {
cl349@2491 2363 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2364 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2365
kaf24@2637 2366 if ( (l1page->u.inuse.domain != d) ||
kaf24@2637 2367 (l1pfn < 0x100) || (l1pfn > max_page) )
kaf24@2637 2368 continue;
iap10@2595 2369
cl349@2644 2370 adjust(l1page, 1, 0);
cl349@2491 2371 }
cl349@2491 2372 }
iap10@2479 2373
cl349@2491 2374 unmap_domain_mem(pt);
cl349@2491 2375 break;
cl349@2491 2376 }
iap10@2479 2377
iap10@2479 2378
kaf24@2637 2379 page->tlbflush_timestamp = 0;
iap10@2479 2380
cl349@2491 2381 list_ent = frame_table[pfn].list.next;
iap10@2479 2382 }
iap10@2479 2383
iap10@2479 2384 spin_unlock(&d->page_alloc_lock);
iap10@2479 2385
cl349@3036 2386 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
iap10@2479 2387
kaf24@2748 2388 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
iap10@2479 2389
cl349@3036 2390 if ( d != current->domain )
cl349@2491 2391 domain_unpause(d);
iap10@2479 2392 }
iap10@2479 2393
cl349@2491 2394 void audit_domains(void)
iap10@2479 2395 {
iap10@2479 2396 struct domain *d;
iap10@2479 2397 for_each_domain ( d )
cl349@2644 2398 audit_domain(d);
iap10@2479 2399 }
iap10@2479 2400
kaf24@2842 2401 void audit_domains_key(unsigned char key)
iap10@2479 2402 {
kaf24@2842 2403 audit_domains();
iap10@2479 2404 }
iap10@2479 2405
iap10@2479 2406 #endif