debuggers.hg

annotate xen/arch/x86/memory.c @ 3620:6d98eb831816

bitkeeper revision 1.1159.212.52 (41fa6980PfhDt-hKCfacnyHcFB7DNQ)

Make page allocator 64-bit safe.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Fri Jan 28 16:34:08 2005 +0000 (2005-01-28)
parents eef1949801b8
children fec8b1778268
rev   line source
djm@1749 1 /******************************************************************************
djm@1749 2 * arch/x86/memory.c
djm@1749 3 *
djm@1749 4 * Copyright (c) 2002-2004 K A Fraser
cl349@2093 5 * Copyright (c) 2004 Christian Limpach
djm@1749 6 *
djm@1749 7 * This program is free software; you can redistribute it and/or modify
djm@1749 8 * it under the terms of the GNU General Public License as published by
djm@1749 9 * the Free Software Foundation; either version 2 of the License, or
djm@1749 10 * (at your option) any later version.
djm@1749 11 *
djm@1749 12 * This program is distributed in the hope that it will be useful,
djm@1749 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
djm@1749 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
djm@1749 15 * GNU General Public License for more details.
djm@1749 16 *
djm@1749 17 * You should have received a copy of the GNU General Public License
djm@1749 18 * along with this program; if not, write to the Free Software
djm@1749 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
djm@1749 20 */
djm@1749 21
djm@1749 22 /*
djm@1749 23 * A description of the x86 page table API:
djm@1749 24 *
djm@1749 25 * Domains trap to do_mmu_update with a list of update requests.
djm@1749 26 * This is a list of (ptr, val) pairs, where the requested operation
djm@1749 27 * is *ptr = val.
djm@1749 28 *
djm@1749 29 * Reference counting of pages:
djm@1749 30 * ----------------------------
djm@1749 31 * Each page has two refcounts: tot_count and type_count.
djm@1749 32 *
djm@1749 33 * TOT_COUNT is the obvious reference count. It counts all uses of a
djm@1749 34 * physical page frame by a domain, including uses as a page directory,
djm@1749 35 * a page table, or simple mappings via a PTE. This count prevents a
djm@1749 36 * domain from releasing a frame back to the free pool when it still holds
djm@1749 37 * a reference to it.
djm@1749 38 *
djm@1749 39 * TYPE_COUNT is more subtle. A frame can be put to one of three
djm@1749 40 * mutually-exclusive uses: it might be used as a page directory, or a
kaf24@2375 41 * page table, or it may be mapped writable by the domain [of course, a
djm@1749 42 * frame may be used in none of these three ways!].
djm@1749 43 * So, type_count is a count of the number of times a frame is being
djm@1749 44 * referred to in its current incarnation. Therefore, a page can only
djm@1749 45 * change its type when its type count is zero.
djm@1749 46 *
djm@1749 47 * Pinning the page type:
djm@1749 48 * ----------------------
djm@1749 49 * The type of a page can be pinned/unpinned with the commands
djm@1749 50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
djm@1749 51 * pinning is not reference counted, so it can't be nested).
djm@1749 52 * This is useful to prevent a page's type count falling to zero, at which
djm@1749 53 * point safety checks would need to be carried out next time the count
djm@1749 54 * is increased again.
djm@1749 55 *
kaf24@2375 56 * A further note on writable page mappings:
kaf24@2375 57 * -----------------------------------------
kaf24@2375 58 * For simplicity, the count of writable mappings for a page may not
kaf24@2375 59 * correspond to reality. The 'writable count' is incremented for every
djm@1749 60 * PTE which maps the page with the _PAGE_RW flag set. However, for
djm@1749 61 * write access to be possible the page directory entry must also have
djm@1749 62 * its _PAGE_RW bit set. We do not check this as it complicates the
djm@1749 63 * reference counting considerably [consider the case of multiple
djm@1749 64 * directory entries referencing a single page table, some with the RW
djm@1749 65 * bit set, others not -- it starts getting a bit messy].
djm@1749 66 * In normal use, this simplification shouldn't be a problem.
djm@1749 67 * However, the logic can be added if required.
djm@1749 68 *
djm@1749 69 * One more note on read-only page mappings:
djm@1749 70 * -----------------------------------------
djm@1749 71 * We want domains to be able to map pages for read-only access. The
djm@1749 72 * main reason is that page tables and directories should be readable
kaf24@2375 73 * by a domain, but it would not be safe for them to be writable.
djm@1749 74 * However, domains have free access to rings 1 & 2 of the Intel
djm@1749 75 * privilege model. In terms of page protection, these are considered
djm@1749 76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
djm@1749 77 * read-only restrictions are respected in supervisor mode -- if the
kaf24@2375 78 * bit is clear then any mapped page is writable.
djm@1749 79 *
djm@1749 80 * We get round this by always setting the WP bit and disallowing
djm@1749 81 * updates to it. This is very unlikely to cause a problem for guest
djm@1749 82 * OS's, which will generally use the WP bit to simplify copy-on-write
djm@1749 83 * implementation (in that case, OS wants a fault when it writes to
djm@1749 84 * an application-supplied buffer).
djm@1749 85 */
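To make the (ptr, val) request format described above concrete, here is a
guest-side sketch that batches two page-table writes into a single hypercall.
It is illustrative only and assumes the public interface of this era: an
mmu_update_t with 'ptr' and 'val' fields, the MMU_NORMAL_PT_UPDATE command
encoded in the low bits of 'ptr', and a three-argument
HYPERVISOR_mmu_update(reqs, count, success_count) wrapper; pte_maddr0/1 and
new_l1e0/1 are placeholder names.

    mmu_update_t req[2];

    /* Each request means "*ptr = val"; the low bits of ptr select the cmd. */
    req[0].ptr = pte_maddr0 | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_l1e0;
    req[1].ptr = pte_maddr1 | MMU_NORMAL_PT_UPDATE;
    req[1].val = new_l1e1;

    if ( HYPERVISOR_mmu_update(req, 2, NULL) < 0 )
        BUG();  /* Xen rejected an update (e.g. bad type or refcount). */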
djm@1749 86
djm@1749 87 #include <xen/config.h>
djm@1749 88 #include <xen/init.h>
kaf24@3392 89 #include <xen/kernel.h>
djm@1749 90 #include <xen/lib.h>
djm@1749 91 #include <xen/mm.h>
djm@1749 92 #include <xen/sched.h>
djm@1749 93 #include <xen/errno.h>
djm@1749 94 #include <xen/perfc.h>
djm@1749 95 #include <xen/irq.h>
iap10@2479 96 #include <xen/softirq.h>
kaf24@1787 97 #include <asm/shadow.h>
djm@1749 98 #include <asm/page.h>
djm@1749 99 #include <asm/flushtlb.h>
djm@1749 100 #include <asm/io.h>
djm@1749 101 #include <asm/uaccess.h>
djm@1749 102 #include <asm/domain_page.h>
djm@1749 103 #include <asm/ldt.h>
djm@1749 104
kaf24@2097 105 #ifdef VERBOSE
djm@1749 106 #define MEM_LOG(_f, _a...) \
djm@1749 107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
cl349@2957 108 current->domain->id , __LINE__ , ## _a )
djm@1749 109 #else
djm@1749 110 #define MEM_LOG(_f, _a...) ((void)0)
djm@1749 111 #endif
djm@1749 112
djm@1749 113 static int alloc_l2_table(struct pfn_info *page);
djm@1749 114 static int alloc_l1_table(struct pfn_info *page);
djm@1749 115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
djm@1749 116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 117 u32 type,
djm@1749 118 struct domain *d);
djm@1749 119
djm@1749 120 static void free_l2_table(struct pfn_info *page);
djm@1749 121 static void free_l1_table(struct pfn_info *page);
djm@1749 122
djm@1749 123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
djm@1749 124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
djm@1749 125
djm@1749 126 /* Used to defer flushing of memory structures. */
djm@1749 127 static struct {
djm@1749 128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
djm@1749 129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
kaf24@3187 130 unsigned long deferred_ops;
kaf24@2314 131 /* If non-NULL, specifies a foreign subject domain for some operations. */
kaf24@3187 132 struct domain *foreign;
kaf24@3113 133 } __cacheline_aligned percpu_info[NR_CPUS];
djm@1749 134
kaf24@2314 135 /*
kaf24@2314 136 * Returns the current foreign domain; defaults to the currently-executing
kaf24@2314 137 * domain if a foreign override hasn't been specified.
kaf24@2314 138 */
cl349@2957 139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
djm@1749 140
kaf24@2336 141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
kaf24@2336 142 static struct domain *dom_xen, *dom_io;
cl349@2227 143
kaf24@3392 144 /* Frame table and its size in pages. */
kaf24@3392 145 struct pfn_info *frame_table;
kaf24@3392 146 unsigned long frame_table_size;
kaf24@3392 147 unsigned long max_page;
kaf24@3392 148
kaf24@3392 149 void __init init_frametable(void)
kaf24@3392 150 {
kaf24@3620 151 #if defined(__i386__)
kaf24@3392 152 unsigned long i, p;
kaf24@3620 153 #endif
kaf24@3392 154
kaf24@3392 155 frame_table_size = max_page * sizeof(struct pfn_info);
kaf24@3392 156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
kaf24@3392 157
kaf24@3620 158 #if defined(__x86_64__)
kaf24@3620 159 frame_table = __va(alloc_boot_pages(frame_table_size, 4UL << 20));
kaf24@3620 160 #elif defined(__i386__)
kaf24@3620 161 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
kaf24@3620 162
kaf24@3392 163 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
kaf24@3392 164 {
kaf24@3392 165 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
kaf24@3392 166 if ( p == 0 )
kaf24@3392 167 panic("Not enough memory for frame table\n");
kaf24@3392 168 idle_pg_table[(FRAMETABLE_VIRT_START + i) >> L2_PAGETABLE_SHIFT] =
kaf24@3392 169 mk_l2_pgentry(p | __PAGE_HYPERVISOR | _PAGE_PSE);
kaf24@3392 170 }
kaf24@3620 171 #endif
kaf24@3392 172
kaf24@3392 173 memset(frame_table, 0, frame_table_size);
kaf24@3392 174 }
kaf24@3392 175
cl349@2227 176 void arch_init_memory(void)
djm@1749 177 {
sos22@3478 178 unsigned long i;
kaf24@2336 179
kaf24@2384 180 /*
kaf24@2384 181 * We are rather picky about the layout of 'struct pfn_info'. The
kaf24@2384 182 * count_info and domain fields must be adjacent, as we perform atomic
kaf24@2384 183 * 64-bit operations on them. Also, just for sanity, we assert the size
kaf24@2384 184 * of the structure here.
kaf24@2384 185 */
kaf24@2384 186 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
kaf24@2384 187 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
kaf24@2384 188 (sizeof(struct pfn_info) != 24) )
kaf24@2384 189 {
kaf24@2384 190 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
kaf24@2384 191 offsetof(struct pfn_info, count_info),
kaf24@2384 192 offsetof(struct pfn_info, u.inuse.domain),
kaf24@2384 193 sizeof(struct pfn_info));
kaf24@2384 194 for ( ; ; ) ;
kaf24@2384 195 }
kaf24@2384 196
djm@1749 197 memset(percpu_info, 0, sizeof(percpu_info));
cl349@2227 198
kaf24@2336 199 /* Initialise to a magic of 0x55555555 so it is easier to spot bugs later. */
kaf24@2336 200 memset(machine_to_phys_mapping, 0x55, 4<<20);
kaf24@2336 201
kaf24@2336 202 /*
kaf24@2336 203 * Initialise our DOMID_XEN domain.
kaf24@2336 204 * Any Xen-heap pages that we will allow to be mapped will have
kaf24@2336 205 * their domain field set to dom_xen.
kaf24@2336 206 */
kaf24@2336 207 dom_xen = alloc_domain_struct();
kaf24@2336 208 atomic_set(&dom_xen->refcnt, 1);
kaf24@2748 209 dom_xen->id = DOMID_XEN;
kaf24@2336 210
kaf24@2336 211 /*
kaf24@2336 212 * Initialise our DOMID_IO domain.
kaf24@2336 213 * This domain owns no pages but is considered a special case when
kaf24@2336 214 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
kaf24@2336 215 */
kaf24@2336 216 dom_io = alloc_domain_struct();
kaf24@2336 217 atomic_set(&dom_io->refcnt, 1);
kaf24@2748 218 dom_io->id = DOMID_IO;
kaf24@2336 219
kaf24@2336 220 /* M2P table is mappable read-only by privileged domains. */
kaf24@3392 221 for ( i = 0; i < 1024; i++ )
kaf24@2336 222 {
sos22@3478 223 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
sos22@3478 224 /* Use the GDT page type to ensure it is only mapped read-only by
sos22@3478 225 non-privileged domains. */
sos22@3478 226 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
sos22@3478 227 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
kaf24@2336 228 }
djm@1749 229 }
djm@1749 230
cl349@2957 231 static void __invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 232 {
djm@1749 233 int i;
djm@1749 234 unsigned long pfn;
djm@1749 235 struct pfn_info *page;
djm@1749 236
djm@1749 237 d->mm.shadow_ldt_mapcnt = 0;
djm@1749 238
djm@1749 239 for ( i = 16; i < 32; i++ )
djm@1749 240 {
cl349@3036 241 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
djm@1749 242 if ( pfn == 0 ) continue;
cl349@3036 243 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
djm@1749 244 page = &frame_table[pfn];
djm@1749 245 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
cl349@3036 246 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
djm@1749 247 put_page_and_type(page);
djm@1749 248 }
djm@1749 249
djm@1749 250 /* Dispose of the (now possibly invalid) mappings from the TLB. */
djm@1749 251 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
djm@1749 252 }
djm@1749 253
djm@1749 254
cl349@2957 255 static inline void invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 256 {
djm@1749 257 if ( d->mm.shadow_ldt_mapcnt != 0 )
djm@1749 258 __invalidate_shadow_ldt(d);
djm@1749 259 }
djm@1749 260
djm@1749 261
kaf24@2336 262 static int alloc_segdesc_page(struct pfn_info *page)
djm@1749 263 {
djm@1749 264 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
djm@1749 265 int i;
djm@1749 266
djm@1749 267 for ( i = 0; i < 512; i++ )
kaf24@1854 268 if ( unlikely(!check_descriptor(&descs[i*2])) )
djm@1749 269 goto fail;
djm@1749 270
djm@1749 271 unmap_domain_mem(descs);
djm@1749 272 return 1;
djm@1749 273
djm@1749 274 fail:
djm@1749 275 unmap_domain_mem(descs);
djm@1749 276 return 0;
djm@1749 277 }
djm@1749 278
djm@1749 279
djm@1749 280 /* Map shadow page at offset @off. */
djm@1749 281 int map_ldt_shadow_page(unsigned int off)
djm@1749 282 {
cl349@2957 283 struct exec_domain *ed = current;
cl349@2957 284 struct domain *d = ed->domain;
djm@1749 285 unsigned long l1e;
djm@1749 286
djm@1749 287 if ( unlikely(in_irq()) )
djm@1749 288 BUG();
djm@1749 289
cl349@2957 290 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
djm@1749 291 PAGE_SHIFT) + off]);
djm@1749 292
djm@1749 293 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
djm@1749 294 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
djm@1749 295 d, PGT_ldt_page)) )
djm@1749 296 return 0;
djm@1749 297
cl349@3036 298 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
cl349@2957 299 ed->mm.shadow_ldt_mapcnt++;
djm@1749 300
djm@1749 301 return 1;
djm@1749 302 }
djm@1749 303
djm@1749 304
djm@1749 305 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
djm@1749 306 {
djm@1749 307 struct pfn_info *page = &frame_table[page_nr];
djm@1749 308
djm@1749 309 if ( unlikely(!pfn_is_ram(page_nr)) )
djm@1749 310 {
djm@1749 311 MEM_LOG("Pfn %08lx is not RAM", page_nr);
djm@1749 312 return 0;
djm@1749 313 }
djm@1749 314
djm@1749 315 if ( unlikely(!get_page(page, d)) )
djm@1749 316 {
djm@1749 317 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
djm@1749 318 return 0;
djm@1749 319 }
djm@1749 320
djm@1749 321 return 1;
djm@1749 322 }
djm@1749 323
djm@1749 324
djm@1749 325 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 326 u32 type,
djm@1749 327 struct domain *d)
djm@1749 328 {
djm@1749 329 struct pfn_info *page = &frame_table[page_nr];
djm@1749 330
djm@1749 331 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
djm@1749 332 return 0;
djm@1749 333
djm@1749 334 if ( unlikely(!get_page_type(page, type)) )
djm@1749 335 {
cl349@2450 336 #ifdef VERBOSE
cl349@2491 337 if ( (type & PGT_type_mask) != PGT_l1_page_table )
cl349@2491 338 MEM_LOG("Bad page type for pfn %08lx (%08x)",
cl349@2491 339 page_nr, page->u.inuse.type_info);
cl349@2450 340 #endif
djm@1749 341 put_page(page);
djm@1749 342 return 0;
djm@1749 343 }
djm@1749 344
djm@1749 345 return 1;
djm@1749 346 }
djm@1749 347
djm@1749 348
djm@1749 349 /*
djm@1749 350 * We allow L2 tables to map each other (a.k.a. linear page tables). This
djm@1749 351 * needs some special care with reference counts and access permissions:
djm@1749 352 * 1. The mapping entry must be read-only, or the guest may get write access
djm@1749 353 * to its own PTEs.
djm@1749 354 * 2. We must only bump the reference counts for an *already validated*
djm@1749 355 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
djm@1749 356 * on a validation that itself depends on this validation completing.
djm@1749 357 * 3. We only need to increment the reference counts for the mapped page
djm@1749 358 * frame if it is mapped by a different L2 table. This is sufficient and
djm@1749 359 * also necessary to allow validation of an L2 table mapping itself.
djm@1749 360 */
kaf24@2314 361 static int
kaf24@2314 362 get_linear_pagetable(
kaf24@2314 363 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
djm@1749 364 {
djm@1749 365 u32 x, y;
djm@1749 366 struct pfn_info *page;
djm@1749 367
djm@1749 368 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
djm@1749 369 {
djm@1749 370 MEM_LOG("Attempt to create linear p.t. with write perms");
djm@1749 371 return 0;
djm@1749 372 }
djm@1749 373
djm@1749 374 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
djm@1749 375 {
djm@1749 376 /* Make sure the mapped frame belongs to the correct domain. */
kaf24@2314 377 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
djm@1749 378 return 0;
djm@1749 379
djm@1749 380 /*
djm@1749 381 * Make sure that the mapped frame is an already-validated L2 table.
djm@1749 382 * If so, atomically increment the count (checking for overflow).
djm@1749 383 */
djm@1749 384 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
kaf24@1970 385 y = page->u.inuse.type_info;
djm@1749 386 do {
djm@1749 387 x = y;
djm@1749 388 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
djm@1749 389 unlikely((x & (PGT_type_mask|PGT_validated)) !=
djm@1749 390 (PGT_l2_page_table|PGT_validated)) )
djm@1749 391 {
djm@1749 392 put_page(page);
djm@1749 393 return 0;
djm@1749 394 }
djm@1749 395 }
kaf24@1970 396 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
djm@1749 397 }
djm@1749 398
djm@1749 399 return 1;
djm@1749 400 }
djm@1749 401
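As a concrete illustration of the three rules above: a guest can install a
linear page table by pointing an entry of its own, already-validated L2 table
back at that same L2 frame, read-only. A minimal sketch, where l2_pfn is the
machine frame number of the guest's L2 table and LINEAR_SLOT is a hypothetical
slot index chosen by the guest (it must lie below
DOMAIN_ENTRIES_PER_L2_PAGETABLE, since mod_l2_entry() rejects updates to the
Xen-private slots):

    mmu_update_t req;

    /* Machine address of the chosen entry within the L2 frame itself. */
    req.ptr = ((l2_pfn << PAGE_SHIFT) +
               (LINEAR_SLOT * sizeof(l2_pgentry_t))) | MMU_NORMAL_PT_UPDATE;
    /* The entry points back at the same frame: present but NOT _PAGE_RW. */
    req.val = (l2_pfn << PAGE_SHIFT) | _PAGE_PRESENT;

    if ( HYPERVISOR_mmu_update(&req, 1, NULL) < 0 )
        BUG();

Xen itself installs a linear self-mapping for every domain at
LINEAR_PT_VIRT_START (see alloc_l2_table() below); the sketch above is the
guest-controlled analogue.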
djm@1749 402
kaf24@2314 403 static int
kaf24@2314 404 get_page_from_l1e(
kaf24@2314 405 l1_pgentry_t l1e, struct domain *d)
djm@1749 406 {
djm@1749 407 unsigned long l1v = l1_pgentry_val(l1e);
djm@1749 408 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2382 409 struct pfn_info *page = &frame_table[pfn];
djm@1749 410 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
djm@1749 411
djm@1749 412 if ( !(l1v & _PAGE_PRESENT) )
djm@1749 413 return 1;
djm@1749 414
djm@1749 415 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
djm@1749 416 {
djm@1749 417 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
djm@1749 418 return 0;
djm@1749 419 }
djm@1749 420
djm@1749 421 if ( unlikely(!pfn_is_ram(pfn)) )
djm@1749 422 {
kaf24@2336 423 /* Revert to caller privileges if FD == DOMID_IO. */
kaf24@2336 424 if ( d == dom_io )
cl349@2957 425 d = current->domain;
kaf24@2336 426
kaf24@2336 427 if ( IS_PRIV(d) )
djm@1749 428 return 1;
djm@1749 429
kaf24@2336 430 if ( IS_CAPABLE_PHYSDEV(d) )
kaf24@2336 431 return domain_iomem_in_pfn(d, pfn);
djm@1749 432
djm@1749 433 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
djm@1749 434 return 0;
djm@1749 435 }
djm@1749 436
kaf24@2756 437 return ((l1v & _PAGE_RW) ?
kaf24@2756 438 get_page_and_type(page, d, PGT_writable_page) :
kaf24@2757 439 get_page(page, d));
djm@1749 440 }
djm@1749 441
djm@1749 442
djm@1749 443 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
kaf24@2314 444 static int
kaf24@2314 445 get_page_from_l2e(
kaf24@2466 446 l2_pgentry_t l2e, unsigned long pfn,
kaf24@2466 447 struct domain *d, unsigned long va_idx)
djm@1749 448 {
iap10@2458 449 int rc;
iap10@2458 450
djm@1749 451 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
djm@1749 452 return 1;
djm@1749 453
djm@1749 454 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
djm@1749 455 {
djm@1749 456 MEM_LOG("Bad L2 page type settings %04lx",
djm@1749 457 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
djm@1749 458 return 0;
djm@1749 459 }
djm@1749 460
iap10@2458 461 rc = get_page_and_type_from_pagenr(
iap10@2458 462 l2_pgentry_to_pagenr(l2e),
kaf24@2466 463 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
iap10@2458 464
iap10@2458 465 if ( unlikely(!rc) )
kaf24@2314 466 return get_linear_pagetable(l2e, pfn, d);
djm@1749 467
djm@1749 468 return 1;
djm@1749 469 }
djm@1749 470
djm@1749 471
kaf24@2382 472 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
djm@1749 473 {
djm@1749 474 unsigned long l1v = l1_pgentry_val(l1e);
kaf24@2385 475 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2385 476 struct pfn_info *page = &frame_table[pfn];
iap10@3424 477 struct domain *e;
djm@1749 478
kaf24@2385 479 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
djm@1749 480 return;
djm@1749 481
iap10@3424 482 e = page->u.inuse.domain;
kaf24@2382 483 if ( unlikely(e != d) )
kaf24@2382 484 {
kaf24@2382 485 /*
kaf24@2382 486 * Unmap a foreign page that may have been mapped via a grant table.
kaf24@2382 487 * Note that this can fail for a privileged domain that can map foreign
kaf24@2382 488 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
kaf24@2382 489 * counted via a grant entry and some counted directly in the page
kaf24@2382 490 * structure's reference count. Note that reference counts won't get
kaf24@2382 491 * dangerously confused as long as we always try to decrement the
kaf24@2382 492 * grant entry first. We may end up with a mismatch between which
kaf24@2382 493 * mappings and which unmappings are counted via the grant entry, but
kaf24@2382 494 * really it doesn't matter as privileged domains have carte blanche.
kaf24@2382 495 */
kaf24@2655 496 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
kaf24@2382 497 return;
kaf24@2382 498 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
kaf24@2382 499 }
kaf24@2382 500
djm@1749 501 if ( l1v & _PAGE_RW )
djm@1749 502 {
djm@1749 503 put_page_and_type(page);
djm@1749 504 }
djm@1749 505 else
djm@1749 506 {
djm@1749 507 /* We expect this to be rare, so we blow away the entire shadow LDT. */
kaf24@1970 508 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
djm@1749 509 PGT_ldt_page)) &&
kaf24@1970 510 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
cl349@2957 511 invalidate_shadow_ldt(e->exec_domain[0]);
djm@1749 512 put_page(page);
djm@1749 513 }
djm@1749 514 }
djm@1749 515
djm@1749 516
djm@1749 517 /*
djm@1749 518 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
djm@1749 519 * Note also that this automatically deals correctly with linear p.t.'s.
djm@1749 520 */
djm@1749 521 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
djm@1749 522 {
djm@1749 523 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
djm@1749 524 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
djm@1749 525 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
djm@1749 526 }
djm@1749 527
djm@1749 528
djm@1749 529 static int alloc_l2_table(struct pfn_info *page)
djm@1749 530 {
kaf24@2314 531 struct domain *d = page->u.inuse.domain;
kaf24@2314 532 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 533 l2_pgentry_t *pl2e;
kaf24@2314 534 int i;
djm@1749 535
djm@1749 536 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 537
kaf24@3392 538 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
iap10@2458 539 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
djm@1749 540 goto fail;
kaf24@3392 541
djm@1749 542 #if defined(__i386__)
djm@1749 543 /* Now we add our private high mappings. */
djm@1749 544 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 545 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 546 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
djm@1749 547 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
djm@1749 548 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
djm@1749 549 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
cl349@3036 550 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
djm@1749 551 __PAGE_HYPERVISOR);
djm@1749 552 #endif
djm@1749 553
djm@1749 554 unmap_domain_mem(pl2e);
djm@1749 555 return 1;
djm@1749 556
djm@1749 557 fail:
djm@1749 558 while ( i-- > 0 )
djm@1749 559 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 560
djm@1749 561 unmap_domain_mem(pl2e);
djm@1749 562 return 0;
djm@1749 563 }
djm@1749 564
djm@1749 565
djm@1749 566 static int alloc_l1_table(struct pfn_info *page)
djm@1749 567 {
kaf24@2314 568 struct domain *d = page->u.inuse.domain;
kaf24@2314 569 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 570 l1_pgentry_t *pl1e;
kaf24@2314 571 int i;
djm@1749 572
djm@1749 573 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 574
djm@1749 575 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2314 576 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
djm@1749 577 goto fail;
djm@1749 578
djm@1749 579 unmap_domain_mem(pl1e);
djm@1749 580 return 1;
djm@1749 581
djm@1749 582 fail:
djm@1749 583 while ( i-- > 0 )
kaf24@2382 584 put_page_from_l1e(pl1e[i], d);
djm@1749 585
djm@1749 586 unmap_domain_mem(pl1e);
djm@1749 587 return 0;
djm@1749 588 }
djm@1749 589
djm@1749 590
djm@1749 591 static void free_l2_table(struct pfn_info *page)
djm@1749 592 {
djm@1749 593 unsigned long page_nr = page - frame_table;
djm@1749 594 l2_pgentry_t *pl2e;
djm@1749 595 int i;
djm@1749 596
djm@1749 597 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 598
djm@1749 599 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
djm@1749 600 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 601
djm@1749 602 unmap_domain_mem(pl2e);
djm@1749 603 }
djm@1749 604
djm@1749 605
djm@1749 606 static void free_l1_table(struct pfn_info *page)
djm@1749 607 {
kaf24@2382 608 struct domain *d = page->u.inuse.domain;
djm@1749 609 unsigned long page_nr = page - frame_table;
djm@1749 610 l1_pgentry_t *pl1e;
djm@1749 611 int i;
djm@1749 612
djm@1749 613 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 614
djm@1749 615 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2382 616 put_page_from_l1e(pl1e[i], d);
djm@1749 617
djm@1749 618 unmap_domain_mem(pl1e);
djm@1749 619 }
djm@1749 620
djm@1749 621
djm@1749 622 static inline int update_l2e(l2_pgentry_t *pl2e,
djm@1749 623 l2_pgentry_t ol2e,
djm@1749 624 l2_pgentry_t nl2e)
djm@1749 625 {
djm@1749 626 unsigned long o = cmpxchg((unsigned long *)pl2e,
djm@1749 627 l2_pgentry_val(ol2e),
djm@1749 628 l2_pgentry_val(nl2e));
djm@1749 629 if ( o != l2_pgentry_val(ol2e) )
djm@1749 630 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 631 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
djm@1749 632 return (o == l2_pgentry_val(ol2e));
djm@1749 633 }
djm@1749 634
djm@1749 635
djm@1749 636 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
djm@1749 637 static int mod_l2_entry(l2_pgentry_t *pl2e,
djm@1749 638 l2_pgentry_t nl2e,
djm@1749 639 unsigned long pfn)
djm@1749 640 {
djm@1749 641 l2_pgentry_t ol2e;
djm@1749 642 unsigned long _ol2e;
djm@1749 643
djm@1749 644 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
djm@1749 645 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
djm@1749 646 {
djm@1749 647 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
djm@1749 648 return 0;
djm@1749 649 }
djm@1749 650
djm@1749 651 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
djm@1749 652 return 0;
djm@1749 653 ol2e = mk_l2_pgentry(_ol2e);
djm@1749 654
djm@1749 655 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
djm@1749 656 {
djm@1749 657 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
djm@1749 658 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
djm@1749 659 return update_l2e(pl2e, ol2e, nl2e);
djm@1749 660
cl349@2957 661 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
cl349@2491 662 ((unsigned long)pl2e &
kaf24@2466 663 ~PAGE_MASK) >> 2)) )
djm@1749 664 return 0;
cl349@1860 665
djm@1749 666 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 667 {
djm@1749 668 put_page_from_l2e(nl2e, pfn);
djm@1749 669 return 0;
djm@1749 670 }
djm@1749 671
djm@1749 672 put_page_from_l2e(ol2e, pfn);
djm@1749 673 return 1;
djm@1749 674 }
djm@1749 675
djm@1749 676 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 677 return 0;
djm@1749 678
djm@1749 679 put_page_from_l2e(ol2e, pfn);
djm@1749 680 return 1;
djm@1749 681 }
djm@1749 682
djm@1749 683
djm@1749 684 static inline int update_l1e(l1_pgentry_t *pl1e,
djm@1749 685 l1_pgentry_t ol1e,
djm@1749 686 l1_pgentry_t nl1e)
djm@1749 687 {
djm@1749 688 unsigned long o = l1_pgentry_val(ol1e);
djm@1749 689 unsigned long n = l1_pgentry_val(nl1e);
djm@1749 690
djm@1749 691 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
djm@1749 692 unlikely(o != l1_pgentry_val(ol1e)) )
djm@1749 693 {
djm@1749 694 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 695 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
djm@1749 696 return 0;
djm@1749 697 }
djm@1749 698
djm@1749 699 return 1;
djm@1749 700 }
djm@1749 701
djm@1749 702
djm@1749 703 /* Update the L1 entry at pl1e to new value nl1e. */
djm@1749 704 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
djm@1749 705 {
djm@1749 706 l1_pgentry_t ol1e;
djm@1749 707 unsigned long _ol1e;
cl349@2957 708 struct domain *d = current->domain;
djm@1749 709
djm@1749 710 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
djm@1749 711 {
djm@1749 712 MEM_LOG("Bad get_user\n");
djm@1749 713 return 0;
djm@1749 714 }
djm@1749 715
djm@1749 716 ol1e = mk_l1_pgentry(_ol1e);
djm@1749 717
djm@1749 718 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
djm@1749 719 {
djm@1749 720 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
djm@1749 721 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
djm@1749 722 return update_l1e(pl1e, ol1e, nl1e);
djm@1749 723
kaf24@2314 724 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
djm@1749 725 return 0;
djm@1749 726
djm@1749 727 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 728 {
kaf24@2382 729 put_page_from_l1e(nl1e, d);
djm@1749 730 return 0;
djm@1749 731 }
djm@1749 732
kaf24@2382 733 put_page_from_l1e(ol1e, d);
djm@1749 734 return 1;
djm@1749 735 }
djm@1749 736
djm@1749 737 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 738 return 0;
djm@1749 739
kaf24@2382 740 put_page_from_l1e(ol1e, d);
djm@1749 741 return 1;
djm@1749 742 }
djm@1749 743
djm@1749 744
djm@1749 745 int alloc_page_type(struct pfn_info *page, unsigned int type)
djm@1749 746 {
djm@1749 747 switch ( type )
djm@1749 748 {
djm@1749 749 case PGT_l1_page_table:
djm@1749 750 return alloc_l1_table(page);
djm@1749 751 case PGT_l2_page_table:
djm@1749 752 return alloc_l2_table(page);
djm@1749 753 case PGT_gdt_page:
djm@1749 754 case PGT_ldt_page:
djm@1749 755 return alloc_segdesc_page(page);
djm@1749 756 default:
cl349@2491 757 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
cl349@2491 758 type, page->u.inuse.type_info,
cl349@2491 759 page->count_info);
djm@1749 760 BUG();
djm@1749 761 }
djm@1749 762
djm@1749 763 return 0;
djm@1749 764 }
djm@1749 765
djm@1749 766
djm@1749 767 void free_page_type(struct pfn_info *page, unsigned int type)
djm@1749 768 {
kaf24@2314 769 struct domain *d = page->u.inuse.domain;
kaf24@2314 770
djm@1749 771 switch ( type )
djm@1749 772 {
djm@1749 773 case PGT_l1_page_table:
djm@1749 774 free_l1_table(page);
djm@1749 775 break;
djm@1749 776
djm@1749 777 case PGT_l2_page_table:
djm@1749 778 free_l2_table(page);
djm@1749 779 break;
djm@1749 780
djm@1749 781 default:
djm@1749 782 BUG();
djm@1749 783 }
kaf24@2314 784
cl349@2957 785 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
cl349@2957 786 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
kaf24@2314 787 {
kaf24@2314 788 unshadow_table(page_to_pfn(page), type);
cl349@2957 789 put_shadow_status(&d->exec_domain[0]->mm);
kaf24@2314 790 }
djm@1749 791 }
djm@1749 792
djm@1749 793
kaf24@2498 794 void put_page_type(struct pfn_info *page)
kaf24@2498 795 {
kaf24@2498 796 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 797
kaf24@2498 798 again:
kaf24@2498 799 do {
kaf24@2498 800 x = y;
kaf24@2498 801 nx = x - 1;
kaf24@2498 802
kaf24@2498 803 ASSERT((x & PGT_count_mask) != 0);
kaf24@2588 804
kaf24@2588 805 /*
kaf24@2588 806 * The page should always be validated while a reference is held. The
kaf24@2588 807 * exception is during domain destruction, when we forcibly invalidate
kaf24@2588 808 * page-table pages if we detect a referential loop.
kaf24@2588 809 * See domain.c:relinquish_list().
kaf24@2588 810 */
kaf24@2588 811 ASSERT((x & PGT_validated) ||
cl349@3036 812 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
kaf24@2498 813
kaf24@2498 814 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 815 {
kaf24@2498 816 /* Record TLB information for flush later. Races are harmless. */
kaf24@2790 817 page->tlbflush_timestamp = tlbflush_current_time();
kaf24@2498 818
kaf24@2588 819 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
kaf24@2588 820 likely(nx & PGT_validated) )
kaf24@2498 821 {
kaf24@2498 822 /*
kaf24@2498 823 * Page-table pages must be unvalidated when count is zero. The
kaf24@2498 824 * 'free' is safe because the refcnt is non-zero and validated
kaf24@2498 825 * bit is clear => other ops will spin or fail.
kaf24@2498 826 */
kaf24@2498 827 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
kaf24@2498 828 x & ~PGT_validated)) != x) )
kaf24@2498 829 goto again;
kaf24@2498 830 /* We cleared the 'valid bit', so now we do the cleanup. */
kaf24@2498 831 free_page_type(page, x & PGT_type_mask);
kaf24@2498 832 /* Carry on, but with the 'valid bit' now clear. */
kaf24@2498 833 x &= ~PGT_validated;
kaf24@2498 834 nx &= ~PGT_validated;
kaf24@2498 835 }
kaf24@2498 836 }
cl349@2644 837 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
kaf24@2498 838 (PGT_pinned | 1)) )
cl349@2644 839 {
kaf24@2498 840 /* Page is now only pinned. Make the back pointer mutable again. */
cl349@2644 841 nx |= PGT_va_mutable;
cl349@2644 842 }
kaf24@2498 843 }
kaf24@2498 844 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 845 }
kaf24@2498 846
kaf24@2498 847
kaf24@2498 848 int get_page_type(struct pfn_info *page, u32 type)
kaf24@2498 849 {
kaf24@2498 850 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 851
kaf24@2498 852 again:
kaf24@2498 853 do {
kaf24@2498 854 x = y;
kaf24@2498 855 nx = x + 1;
kaf24@2498 856 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 857 {
kaf24@2498 858 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
kaf24@2498 859 return 0;
kaf24@2498 860 }
kaf24@2498 861 else if ( unlikely((x & PGT_count_mask) == 0) )
kaf24@2498 862 {
kaf24@2498 863 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
kaf24@2498 864 {
kaf24@2498 865 /*
kaf24@2498 866 * On type change we check whether stale TLB entries must be flushed. This
kaf24@2498 867 * may be unnecessary (e.g., page was GDT/LDT) but those
kaf24@2498 868 * circumstances should be very rare.
kaf24@2498 869 */
kaf24@2498 870 struct domain *d = page->u.inuse.domain;
cl349@2957 871 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
kaf24@2498 872 page->tlbflush_timestamp)) )
kaf24@2498 873 {
kaf24@2498 874 perfc_incr(need_flush_tlb_flush);
cl349@2957 875 flush_tlb_cpu(d->exec_domain[0]->processor);
kaf24@2498 876 }
kaf24@2498 877
kaf24@2498 878 /* We lose existing type, back pointer, and validity. */
kaf24@2498 879 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
kaf24@2498 880 nx |= type;
kaf24@2498 881
kaf24@2498 882 /* No special validation needed for writable pages. */
kaf24@2498 883 /* Page tables and GDT/LDT need to be scanned for validity. */
kaf24@2498 884 if ( type == PGT_writable_page )
kaf24@2498 885 nx |= PGT_validated;
kaf24@2498 886 }
kaf24@2498 887 }
kaf24@2498 888 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
kaf24@2498 889 {
kaf24@2498 890 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
kaf24@2498 891 {
kaf24@2498 892 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
kaf24@2498 893 ((type & PGT_type_mask) != PGT_l1_page_table) )
kaf24@2498 894 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
kaf24@2498 895 x & PGT_type_mask, type, page_to_pfn(page));
kaf24@2498 896 return 0;
kaf24@2498 897 }
kaf24@2498 898 else if ( (x & PGT_va_mask) == PGT_va_mutable )
kaf24@2498 899 {
kaf24@2498 900 /* The va backpointer is mutable, hence we update it. */
kaf24@2498 901 nx &= ~PGT_va_mask;
kaf24@2498 902 nx |= type; /* we know the actual type is correct */
kaf24@2498 903 }
kaf24@2498 904 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
kaf24@2498 905 {
kaf24@2506 906 /* This table is potentially mapped at multiple locations. */
kaf24@2506 907 nx &= ~PGT_va_mask;
kaf24@2506 908 nx |= PGT_va_unknown;
kaf24@2498 909 }
kaf24@2498 910 }
cl349@2644 911 else if ( unlikely(!(x & PGT_validated)) )
kaf24@2498 912 {
kaf24@2498 913 /* Someone else is updating validation of this page. Wait... */
kaf24@2498 914 while ( (y = page->u.inuse.type_info) == x )
kaf24@2498 915 {
kaf24@2498 916 rep_nop();
kaf24@2498 917 barrier();
kaf24@2498 918 }
kaf24@2498 919 goto again;
kaf24@2498 920 }
kaf24@2498 921 }
kaf24@2498 922 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 923
kaf24@2498 924 if ( unlikely(!(nx & PGT_validated)) )
kaf24@2498 925 {
kaf24@2498 926 /* Try to validate page type; drop the new reference on failure. */
kaf24@2498 927 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
kaf24@2498 928 {
kaf24@2498 929 MEM_LOG("Error while validating pfn %08lx for type %08x."
kaf24@2498 930 " caf=%08x taf=%08x\n",
kaf24@2498 931 page_to_pfn(page), type,
cl349@2644 932 page->count_info,
cl349@2644 933 page->u.inuse.type_info);
kaf24@2498 934 /* No one else can get a reference. We hold the only ref. */
kaf24@2498 935 page->u.inuse.type_info = 0;
kaf24@2498 936 return 0;
kaf24@2498 937 }
kaf24@2498 938
kaf24@2498 939 /* No one else is updating simultaneously. */
kaf24@2498 940 __set_bit(_PGT_validated, &page->u.inuse.type_info);
kaf24@2498 941 }
kaf24@2498 942
kaf24@2498 943 return 1;
kaf24@2498 944 }
kaf24@2498 945
kaf24@2498 946
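The usual calling pattern for the two routines above is via the
get_page_and_type()/put_page_and_type() helpers used throughout this file.
A brief sketch of claiming a frame as an L1 page table and releasing it again
(PGT_va_mutable leaves the va backpointer to be fixed up by a later
get_page_type() call):

    /* Take a general reference plus a typed, validated reference. */
    if ( get_page_and_type(page, d, PGT_l1_page_table | PGT_va_mutable) )
    {
        /* ... the frame is now guaranteed to be a validated L1 table ... */
        put_page_and_type(page); /* May trigger free_page_type() at zero. */
    }
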
kaf24@3443 947 int new_guest_cr3(unsigned long pfn)
kaf24@3443 948 {
kaf24@3443 949 struct exec_domain *ed = current;
kaf24@3443 950 struct domain *d = ed->domain;
kaf24@3443 951 int okay, cpu = smp_processor_id();
kaf24@3443 952 unsigned long old_base_pfn;
kaf24@3443 953
kaf24@3443 954 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
kaf24@3443 955 if ( likely(okay) )
kaf24@3443 956 {
kaf24@3443 957 invalidate_shadow_ldt(ed);
kaf24@3443 958
kaf24@3443 959 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
kaf24@3443 960 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
kaf24@3443 961 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
kaf24@3443 962
kaf24@3443 963 shadow_mk_pagetable(&ed->mm);
kaf24@3443 964
kaf24@3443 965 write_ptbase(&ed->mm);
kaf24@3443 966
kaf24@3443 967 put_page_and_type(&frame_table[old_base_pfn]);
kaf24@3443 968 }
kaf24@3443 969 else
kaf24@3443 970 {
kaf24@3517 971 MEM_LOG("Error while installing new baseptr %08lx", pfn);
kaf24@3443 972 }
kaf24@3443 973
kaf24@3443 974 return okay;
kaf24@3443 975 }
kaf24@3443 976
djm@1749 977 static int do_extended_command(unsigned long ptr, unsigned long val)
djm@1749 978 {
djm@1749 979 int okay = 1, cpu = smp_processor_id();
djm@1749 980 unsigned int cmd = val & MMUEXT_CMD_MASK;
djm@1749 981 unsigned long pfn = ptr >> PAGE_SHIFT;
djm@1749 982 struct pfn_info *page = &frame_table[pfn];
cl349@2957 983 struct exec_domain *ed = current;
cl349@2957 984 struct domain *d = ed->domain, *nd, *e;
djm@1749 985 u32 x, y;
djm@1749 986 domid_t domid;
kaf24@2385 987 grant_ref_t gntref;
djm@1749 988
djm@1749 989 switch ( cmd )
djm@1749 990 {
kaf24@2465 991 case MMUEXT_PIN_L1_TABLE:
kaf24@2465 992 case MMUEXT_PIN_L2_TABLE:
kaf24@2466 993 /*
kaf24@2466 994 * We insist that, if you pin an L1 page, it's the first thing that
kaf24@2466 995 * you do to it. This is because we require the backptr to still be
kaf24@2466 996 * mutable. This assumption seems safe.
kaf24@2466 997 */
djm@1749 998 okay = get_page_and_type_from_pagenr(
kaf24@2465 999 pfn,
kaf24@2465 1000 ((cmd==MMUEXT_PIN_L2_TABLE) ?
cl349@2491 1001 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
kaf24@2465 1002 FOREIGNDOM);
iap10@2458 1003
djm@1749 1004 if ( unlikely(!okay) )
djm@1749 1005 {
djm@1749 1006 MEM_LOG("Error while pinning pfn %08lx", pfn);
djm@1749 1007 break;
djm@1749 1008 }
djm@1749 1009
kaf24@2466 1010 if ( unlikely(test_and_set_bit(_PGT_pinned,
kaf24@2466 1011 &page->u.inuse.type_info)) )
djm@1749 1012 {
djm@1749 1013 MEM_LOG("Pfn %08lx already pinned", pfn);
djm@1749 1014 put_page_and_type(page);
djm@1749 1015 okay = 0;
djm@1749 1016 break;
djm@1749 1017 }
djm@1749 1018
djm@1749 1019 break;
djm@1749 1020
djm@1749 1021 case MMUEXT_UNPIN_TABLE:
kaf24@2314 1022 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
djm@1749 1023 {
djm@1749 1024 MEM_LOG("Page %08lx bad domain (dom=%p)",
kaf24@1970 1025 ptr, page->u.inuse.domain);
djm@1749 1026 }
kaf24@2466 1027 else if ( likely(test_and_clear_bit(_PGT_pinned,
kaf24@2466 1028 &page->u.inuse.type_info)) )
djm@1749 1029 {
djm@1749 1030 put_page_and_type(page);
djm@1749 1031 put_page(page);
djm@1749 1032 }
djm@1749 1033 else
djm@1749 1034 {
djm@1749 1035 okay = 0;
djm@1749 1036 put_page(page);
djm@1749 1037 MEM_LOG("Pfn %08lx not pinned", pfn);
djm@1749 1038 }
djm@1749 1039 break;
djm@1749 1040
djm@1749 1041 case MMUEXT_NEW_BASEPTR:
kaf24@3443 1042 okay = new_guest_cr3(pfn);
djm@1749 1043 break;
djm@1749 1044
djm@1749 1045 case MMUEXT_TLB_FLUSH:
djm@1749 1046 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
djm@1749 1047 break;
djm@1749 1048
djm@1749 1049 case MMUEXT_INVLPG:
djm@1749 1050 __flush_tlb_one(ptr);
djm@1749 1051 break;
djm@1749 1052
kaf24@2463 1053 case MMUEXT_FLUSH_CACHE:
kaf24@2463 1054 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
kaf24@2463 1055 {
kaf24@2463 1056 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
kaf24@2463 1057 okay = 0;
kaf24@2463 1058 }
kaf24@2463 1059 else
kaf24@2463 1060 {
kaf24@2463 1061 wbinvd();
kaf24@2463 1062 }
kaf24@2463 1063 break;
kaf24@2463 1064
djm@1749 1065 case MMUEXT_SET_LDT:
djm@1749 1066 {
djm@1749 1067 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
djm@1749 1068 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
djm@1749 1069 (ents > 8192) ||
djm@1749 1070 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
djm@1749 1071 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
djm@1749 1072 {
djm@1749 1073 okay = 0;
djm@1749 1074 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
djm@1749 1075 }
cl349@2957 1076 else if ( (ed->mm.ldt_ents != ents) ||
cl349@2957 1077 (ed->mm.ldt_base != ptr) )
djm@1749 1078 {
cl349@2957 1079 invalidate_shadow_ldt(ed);
cl349@2957 1080 ed->mm.ldt_base = ptr;
cl349@2957 1081 ed->mm.ldt_ents = ents;
cl349@2957 1082 load_LDT(ed);
djm@1749 1083 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
djm@1749 1084 if ( ents != 0 )
djm@1749 1085 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
djm@1749 1086 }
djm@1749 1087 break;
djm@1749 1088 }
djm@1749 1089
kaf24@2314 1090 case MMUEXT_SET_FOREIGNDOM:
kaf24@2314 1091 domid = (domid_t)(val >> 16);
djm@1749 1092
kaf24@2362 1093 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2362 1094 put_domain(e);
kaf24@2362 1095 percpu_info[cpu].foreign = NULL;
kaf24@2362 1096
djm@1749 1097 if ( !IS_PRIV(d) )
djm@1749 1098 {
kaf24@2336 1099 switch ( domid )
kaf24@2336 1100 {
kaf24@2336 1101 case DOMID_IO:
kaf24@2362 1102 get_knownalive_domain(dom_io);
kaf24@2362 1103 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1104 break;
kaf24@2336 1105 default:
kaf24@2748 1106 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
kaf24@2336 1107 okay = 0;
kaf24@2336 1108 break;
kaf24@2336 1109 }
djm@1749 1110 }
djm@1749 1111 else
djm@1749 1112 {
kaf24@2314 1113 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
kaf24@2314 1114 if ( e == NULL )
djm@1749 1115 {
kaf24@2336 1116 switch ( domid )
kaf24@2336 1117 {
kaf24@2336 1118 case DOMID_XEN:
kaf24@2362 1119 get_knownalive_domain(dom_xen);
kaf24@2362 1120 percpu_info[cpu].foreign = dom_xen;
kaf24@2336 1121 break;
kaf24@2336 1122 case DOMID_IO:
kaf24@2362 1123 get_knownalive_domain(dom_io);
kaf24@2362 1124 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1125 break;
kaf24@2336 1126 default:
kaf24@2336 1127 MEM_LOG("Unknown domain '%u'", domid);
kaf24@2336 1128 okay = 0;
kaf24@2336 1129 break;
kaf24@2336 1130 }
djm@1749 1131 }
djm@1749 1132 }
djm@1749 1133 break;
djm@1749 1134
kaf24@2385 1135 case MMUEXT_TRANSFER_PAGE:
kaf24@2385 1136 domid = (domid_t)(val >> 16);
kaf24@2385 1137 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
kaf24@2385 1138
kaf24@2385 1139 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
kaf24@2385 1140 unlikely(!pfn_is_ram(pfn)) ||
kaf24@2385 1141 unlikely((e = find_domain_by_id(domid)) == NULL) )
kaf24@2385 1142 {
kaf24@2385 1143 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
kaf24@2385 1144 okay = 0;
kaf24@2385 1145 break;
kaf24@2385 1146 }
kaf24@2385 1147
kaf24@2385 1148 spin_lock(&d->page_alloc_lock);
kaf24@2385 1149
kaf24@2385 1150 /*
kaf24@2385 1151 * The tricky bit: atomically release ownership while there is just one
kaf24@2385 1152 * benign reference to the page (PGC_allocated). If that reference
kaf24@2385 1153 * disappears then the deallocation routine will safely spin.
kaf24@2385 1154 */
kaf24@2385 1155 nd = page->u.inuse.domain;
kaf24@2385 1156 y = page->count_info;
kaf24@2385 1157 do {
kaf24@2385 1158 x = y;
kaf24@2385 1159 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
kaf24@2385 1160 (1|PGC_allocated)) ||
kaf24@2385 1161 unlikely(nd != d) )
kaf24@2385 1162 {
kaf24@2385 1163 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
kaf24@2385 1164 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1165 d, d->id, nd, x, page->u.inuse.type_info);
kaf24@2385 1166 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1167 put_domain(e);
kaf24@2663 1168 return 0;
kaf24@2385 1169 }
kaf24@2385 1170 __asm__ __volatile__(
kaf24@2385 1171 LOCK_PREFIX "cmpxchg8b %2"
kaf24@2385 1172 : "=d" (nd), "=a" (y),
kaf24@2385 1173 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2385 1174 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
kaf24@2385 1175 }
kaf24@2385 1176 while ( unlikely(nd != d) || unlikely(y != x) );
kaf24@2385 1177
kaf24@2385 1178 /*
kaf24@2385 1179 * Unlink from 'd'. At least one reference remains (now anonymous), so
kaf24@2385 1180 * no one else is spinning to try to delete this page from 'd'.
kaf24@2385 1181 */
kaf24@2385 1182 d->tot_pages--;
kaf24@2385 1183 list_del(&page->list);
kaf24@2385 1184
kaf24@2385 1185 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1186
kaf24@2385 1187 spin_lock(&e->page_alloc_lock);
kaf24@2385 1188
kaf24@2466 1189 /*
kaf24@2466 1190 * Check that 'e' will accept the page and has reservation headroom.
kaf24@2466 1191 * Also, a domain mustn't have PGC_allocated pages when it is dying.
kaf24@2466 1192 */
kaf24@2385 1193 ASSERT(e->tot_pages <= e->max_pages);
cl349@2957 1194 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
kaf24@2466 1195 unlikely(e->tot_pages == e->max_pages) ||
kaf24@2385 1196 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
kaf24@2385 1197 {
kaf24@2431 1198 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
kaf24@2469 1199 "provided a bad grant ref, or is dying (%08lx).\n",
cl349@2957 1200 e->tot_pages, e->max_pages, e->d_flags);
kaf24@2385 1201 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1202 put_domain(e);
kaf24@2385 1203 okay = 0;
kaf24@2385 1204 break;
kaf24@2385 1205 }
kaf24@2385 1206
kaf24@2385 1207 /* Okay, add the page to 'e'. */
kaf24@2385 1208 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2385 1209 get_knownalive_domain(e);
kaf24@2385 1210 list_add_tail(&page->list, &e->page_list);
kaf24@2385 1211 page->u.inuse.domain = e;
kaf24@2385 1212
kaf24@2385 1213 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1214
kaf24@2385 1215 /* Transfer is all done: tell the guest about its new page frame. */
kaf24@2385 1216 gnttab_notify_transfer(e, gntref, pfn);
kaf24@2385 1217
kaf24@2385 1218 put_domain(e);
kaf24@2385 1219 break;
kaf24@2385 1220
djm@1749 1221 case MMUEXT_REASSIGN_PAGE:
djm@1749 1222 if ( unlikely(!IS_PRIV(d)) )
djm@1749 1223 {
kaf24@2748 1224 MEM_LOG("Dom %u has no reassignment priv", d->id);
djm@1749 1225 okay = 0;
djm@1749 1226 break;
djm@1749 1227 }
djm@1749 1228
kaf24@2314 1229 e = percpu_info[cpu].foreign;
kaf24@2314 1230 if ( unlikely(e == NULL) )
djm@1749 1231 {
kaf24@2314 1232 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
djm@1749 1233 okay = 0;
djm@1749 1234 break;
djm@1749 1235 }
djm@1749 1236
djm@1749 1237 /*
djm@1749 1238 * Grab both page_list locks, in order. This prevents the page from
djm@1749 1239 * disappearing elsewhere while we modify the owner, and we'll need
djm@1749 1240 * both locks if we're successful so that we can change lists.
djm@1749 1241 */
djm@1749 1242 if ( d < e )
djm@1749 1243 {
djm@1749 1244 spin_lock(&d->page_alloc_lock);
djm@1749 1245 spin_lock(&e->page_alloc_lock);
djm@1749 1246 }
djm@1749 1247 else
djm@1749 1248 {
djm@1749 1249 spin_lock(&e->page_alloc_lock);
djm@1749 1250 spin_lock(&d->page_alloc_lock);
djm@1749 1251 }
djm@1749 1252
djm@1749 1253 /* A domain shouldn't have PGC_allocated pages when it is dying. */
cl349@2957 1254 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
djm@1749 1255 unlikely(IS_XEN_HEAP_FRAME(page)) )
djm@1749 1256 {
kaf24@1871 1257 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
djm@1749 1258 okay = 0;
djm@1749 1259 goto reassign_fail;
djm@1749 1260 }
djm@1749 1261
djm@1749 1262 /*
djm@1749 1263 * The tricky bit: atomically change owner while there is just one
djm@1749 1264 * benign reference to the page (PGC_allocated). If that reference
djm@1749 1265 * disappears then the deallocation routine will safely spin.
djm@1749 1266 */
kaf24@1970 1267 nd = page->u.inuse.domain;
kaf24@2384 1268 y = page->count_info;
djm@1749 1269 do {
djm@1749 1270 x = y;
djm@1749 1271 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
djm@1749 1272 (1|PGC_allocated)) ||
djm@1749 1273 unlikely(nd != d) )
djm@1749 1274 {
djm@1749 1275 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
djm@1749 1276 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1277 d, d->id, nd, x, page->u.inuse.type_info);
djm@1749 1278 okay = 0;
djm@1749 1279 goto reassign_fail;
djm@1749 1280 }
djm@1749 1281 __asm__ __volatile__(
djm@1749 1282 LOCK_PREFIX "cmpxchg8b %3"
kaf24@2384 1283 : "=d" (nd), "=a" (y), "=c" (e),
kaf24@2384 1284 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2384 1285 : "0" (d), "1" (x), "c" (e), "b" (x) );
djm@1749 1286 }
djm@1749 1287 while ( unlikely(nd != d) || unlikely(y != x) );
djm@1749 1288
djm@1749 1289 /*
djm@1749 1290 * Unlink from 'd'. We transferred at least one reference to 'e', so
djm@1749 1291 * no one else is spinning to try to delete this page from 'd'.
djm@1749 1292 */
djm@1749 1293 d->tot_pages--;
djm@1749 1294 list_del(&page->list);
djm@1749 1295
djm@1749 1296 /*
djm@1749 1297 * Add the page to 'e'. Someone may already have removed the last
djm@1749 1298 * reference and want to remove the page from 'e'. However, we have
djm@1749 1299 * the lock so they'll spin waiting for us.
djm@1749 1300 */
djm@1749 1301 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2336 1302 get_knownalive_domain(e);
djm@1749 1303 list_add_tail(&page->list, &e->page_list);
djm@1749 1304
djm@1749 1305 reassign_fail:
djm@1749 1306 spin_unlock(&d->page_alloc_lock);
djm@1749 1307 spin_unlock(&e->page_alloc_lock);
djm@1749 1308 break;
djm@1749 1309
kaf24@2314 1310 case MMUEXT_CLEAR_FOREIGNDOM:
kaf24@2314 1311 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2314 1312 put_domain(e);
kaf24@2314 1313 percpu_info[cpu].foreign = NULL;
djm@1749 1314 break;
djm@1749 1315
djm@1749 1316 default:
djm@1749 1317 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
djm@1749 1318 okay = 0;
djm@1749 1319 break;
djm@1749 1320 }
djm@1749 1321
djm@1749 1322 return okay;
djm@1749 1323 }
djm@1749 1324
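A guest-side sketch of issuing one of the extended commands handled above:
pinning an L2 table so that its type count can never fall to zero behind the
guest's back. As before, this assumes the mmu_update_t/HYPERVISOR_mmu_update
interface of this era; l2_pfn is a placeholder for the guest's page-directory
frame number.

    mmu_update_t req;

    /* ptr carries the target frame plus the command selector in its low
     * bits; val carries the extended command itself. */
    req.ptr = (l2_pfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req.val = MMUEXT_PIN_L2_TABLE;

    if ( HYPERVISOR_mmu_update(&req, 1, NULL) < 0 )
        BUG();
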
kaf24@3177 1325 int do_mmu_update(
kaf24@3177 1326 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
kaf24@3177 1327 {
kaf24@3177 1328 /*
kaf24@3177 1329 * We steal the m.s.b. of the @count parameter to indicate whether this
kaf24@3177 1330 * invocation of do_mmu_update() is resuming a previously preempted call.
kaf24@3187 1331 * We steal the next 15 bits to remember the current FOREIGNDOM.
kaf24@3177 1332 */
kaf24@3187 1333 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
kaf24@3187 1334 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
kaf24@3187 1335 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
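/*
 * A worked example of this encoding (a sketch, assuming a 32-bit unsigned
 * int as on i386):
 *   bit  31     - MMU_UPDATE_PREEMPTED flag           (0x80000000)
 *   bits 30..16 - preempted FOREIGNDOM id (MMU_UPDATE_PREEMPT_FDOM_MASK)
 *   bits 15..0  - number of requests still to be processed
 * A continuation with 10 requests left, made on behalf of domain 5,
 * therefore re-enters with count == 0x80000000 | (5 << 16) | 10 == 0x8005000A.
 */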
djm@1749 1336
djm@1749 1337 mmu_update_t req;
djm@1749 1338 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
djm@1749 1339 struct pfn_info *page;
kaf24@3187 1340 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
kaf24@3177 1341 unsigned int cmd, done = 0;
djm@1749 1342 unsigned long prev_spfn = 0;
djm@1749 1343 l1_pgentry_t *prev_spl1e = 0;
cl349@2957 1344 struct exec_domain *ed = current;
cl349@2957 1345 struct domain *d = ed->domain;
kaf24@2466 1346 u32 type_info;
kaf24@3187 1347 domid_t domid;
djm@1749 1348
cl349@3036 1349 LOCK_BIGLOCK(d);
cl349@3036 1350
kaf24@3517 1351 cleanup_writable_pagetable(d);
kaf24@2375 1352
kaf24@3177 1353 /*
kaf24@3177 1354 * If we are resuming after preemption, read how much work we have already
kaf24@3177 1355 * done. This allows us to set the @done output parameter correctly.
kaf24@3187 1356 * We also reset FOREIGNDOM here.
kaf24@3177 1357 */
kaf24@3187 1358 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
kaf24@3177 1359 {
kaf24@3187 1360 if ( !(count & MMU_UPDATE_PREEMPTED) )
kaf24@3187 1361 {
kaf24@3187 1362 /* Count overflow into private FOREIGNDOM field. */
kaf24@3187 1363 MEM_LOG("do_mmu_update count is too large");
kaf24@3187 1364 rc = -EINVAL;
kaf24@3187 1365 goto out;
kaf24@3187 1366 }
kaf24@3177 1367 count &= ~MMU_UPDATE_PREEMPTED;
kaf24@3187 1368 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
kaf24@3187 1369 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
kaf24@3177 1370 if ( unlikely(pdone != NULL) )
kaf24@3177 1371 (void)get_user(done, pdone);
cl349@3193 1372 if ( (domid != current->domain->id) &&
kaf24@3187 1373 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
kaf24@3187 1374 {
kaf24@3187 1375 rc = -EINVAL;
kaf24@3187 1376 goto out;
kaf24@3187 1377 }
kaf24@3177 1378 }
kaf24@3177 1379
kaf24@3269 1380 perfc_incrc(calls_to_mmu_update);
kaf24@3269 1381 perfc_addc(num_page_updates, count);
kaf24@3269 1382
kaf24@3177 1383 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
kaf24@3187 1384 {
kaf24@3187 1385 rc = -EFAULT;
kaf24@3187 1386 goto out;
kaf24@3187 1387 }
cl349@1860 1388
djm@1749 1389 for ( i = 0; i < count; i++ )
djm@1749 1390 {
kaf24@3177 1391 if ( hypercall_preempt_check() )
kaf24@3177 1392 {
kaf24@3187 1393 rc = hypercall_create_continuation(
kaf24@3177 1394 __HYPERVISOR_mmu_update, 3, ureqs,
kaf24@3187 1395 (count - i) |
kaf24@3187 1396 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
kaf24@3187 1397 MMU_UPDATE_PREEMPTED, pdone);
kaf24@3177 1398 break;
kaf24@3177 1399 }
kaf24@3129 1400
kaf24@2375 1401 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
djm@1749 1402 {
kaf24@2375 1403 MEM_LOG("Bad __copy_from_user");
djm@1749 1404 rc = -EFAULT;
djm@1749 1405 break;
djm@1749 1406 }
djm@1749 1407
djm@1749 1408 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
djm@1749 1409 pfn = req.ptr >> PAGE_SHIFT;
djm@1749 1410
djm@1749 1411 okay = 0;
djm@1749 1412
djm@1749 1413 switch ( cmd )
djm@1749 1414 {
djm@1749 1415 /*
djm@1749 1416 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
djm@1749 1417 */
djm@1749 1418 case MMU_NORMAL_PT_UPDATE:
cl349@2957 1419 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
djm@1749 1420 {
djm@1749 1421 MEM_LOG("Could not get page for normal update");
djm@1749 1422 break;
djm@1749 1423 }
djm@1749 1424
djm@1749 1425 if ( likely(prev_pfn == pfn) )
djm@1749 1426 {
djm@1749 1427 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
djm@1749 1428 }
djm@1749 1429 else
djm@1749 1430 {
djm@1749 1431 if ( prev_pfn != 0 )
djm@1749 1432 unmap_domain_mem((void *)va);
djm@1749 1433 va = (unsigned long)map_domain_mem(req.ptr);
djm@1749 1434 prev_pfn = pfn;
djm@1749 1435 }
djm@1749 1436
djm@1749 1437 page = &frame_table[pfn];
kaf24@2466 1438 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
djm@1749 1439 {
djm@1749 1440 case PGT_l1_page_table:
kaf24@2466 1441 if ( likely(get_page_type(
kaf24@2466 1442 page, type_info & (PGT_type_mask|PGT_va_mask))) )
djm@1749 1443 {
djm@1749 1444 okay = mod_l1_entry((l1_pgentry_t *)va,
djm@1749 1445 mk_l1_pgentry(req.val));
djm@1749 1446
cl349@2957 1447 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1448 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1449 PSH_shadowed) )
djm@1749 1450 {
kaf24@2375 1451 shadow_l1_normal_pt_update(
kaf24@2375 1452 req.ptr, req.val, &prev_spfn, &prev_spl1e);
cl349@2957 1453 put_shadow_status(&ed->mm);
djm@1749 1454 }
djm@1749 1455
djm@1749 1456 put_page_type(page);
djm@1749 1457 }
djm@1749 1458 break;
djm@1749 1459 case PGT_l2_page_table:
djm@1749 1460 if ( likely(get_page_type(page, PGT_l2_page_table)) )
djm@1749 1461 {
djm@1749 1462 okay = mod_l2_entry((l2_pgentry_t *)va,
djm@1749 1463 mk_l2_pgentry(req.val),
djm@1749 1464 pfn);
djm@1749 1465
cl349@2957 1466 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1467 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1468 PSH_shadowed) )
djm@1749 1469 {
kaf24@2375 1470 shadow_l2_normal_pt_update(req.ptr, req.val);
cl349@2957 1471 put_shadow_status(&ed->mm);
djm@1749 1472 }
djm@1749 1473
djm@1749 1474 put_page_type(page);
djm@1749 1475 }
djm@1749 1476 break;
djm@1749 1477 default:
kaf24@2375 1478 if ( likely(get_page_type(page, PGT_writable_page)) )
djm@1749 1479 {
djm@1749 1480 *(unsigned long *)va = req.val;
djm@1749 1481 okay = 1;
djm@1749 1482 put_page_type(page);
djm@1749 1483 }
djm@1749 1484 break;
djm@1749 1485 }
djm@1749 1486
djm@1749 1487 put_page(page);
djm@1749 1488 break;
djm@1749 1489
djm@1749 1490 case MMU_MACHPHYS_UPDATE:
kaf24@2314 1491 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
djm@1749 1492 {
djm@1749 1493 MEM_LOG("Could not get page for mach->phys update");
djm@1749 1494 break;
djm@1749 1495 }
djm@1749 1496
djm@1749 1497 machine_to_phys_mapping[pfn] = req.val;
djm@1749 1498 okay = 1;
djm@1749 1499
djm@1749 1500 /*
djm@1749 1501 * If in log-dirty mode, mark the corresponding pseudo-physical
djm@1749 1502 * page as dirty.
djm@1749 1503 */
cl349@2957 1504 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
cl349@2957 1505 mark_dirty(&ed->mm, pfn) )
cl349@2957 1506 ed->mm.shadow_dirty_block_count++;
djm@1749 1507
djm@1749 1508 put_page(&frame_table[pfn]);
djm@1749 1509 break;
djm@1749 1510
djm@1749 1511 /*
djm@1749 1512 * MMU_EXTENDED_COMMAND: Extended command is specified
djm@1749 1513 * in the least-significant bits of the 'value' field.
djm@1749 1514 */
djm@1749 1515 case MMU_EXTENDED_COMMAND:
djm@1749 1516 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
djm@1749 1517 okay = do_extended_command(req.ptr, req.val);
djm@1749 1518 break;
djm@1749 1519
djm@1749 1520 default:
djm@1749 1521 MEM_LOG("Invalid page update command %08lx", req.ptr);
djm@1749 1522 break;
djm@1749 1523 }
djm@1749 1524
djm@1749 1525 if ( unlikely(!okay) )
djm@1749 1526 {
djm@1749 1527 rc = -EINVAL;
djm@1749 1528 break;
djm@1749 1529 }
djm@1749 1530
djm@1749 1531 ureqs++;
djm@1749 1532 }
djm@1749 1533
kaf24@3187 1534 out:
djm@1749 1535 if ( prev_pfn != 0 )
djm@1749 1536 unmap_domain_mem((void *)va);
djm@1749 1537
kaf24@2375 1538 if ( unlikely(prev_spl1e != 0) )
djm@1749 1539 unmap_domain_mem((void *)prev_spl1e);
djm@1749 1540
djm@1749 1541 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1542 percpu_info[cpu].deferred_ops = 0;
djm@1749 1543
djm@1749 1544 if ( deferred_ops & DOP_FLUSH_TLB )
djm@1749 1545 local_flush_tlb();
kaf24@2375 1546
djm@1749 1547 if ( deferred_ops & DOP_RELOAD_LDT )
djm@1749 1548 (void)map_ldt_shadow_page(0);
djm@1749 1549
kaf24@2314 1550 if ( unlikely(percpu_info[cpu].foreign != NULL) )
djm@1749 1551 {
kaf24@2314 1552 put_domain(percpu_info[cpu].foreign);
kaf24@2314 1553 percpu_info[cpu].foreign = NULL;
djm@1749 1554 }
djm@1749 1555
kaf24@3177 1556 /* Add incremental work we have done to the @done output parameter. */
kaf24@3177 1557 if ( unlikely(pdone != NULL) )
kaf24@3177 1558 __put_user(done + i, pdone);
djm@1749 1559
cl349@3036 1560 UNLOCK_BIGLOCK(d);
djm@1749 1561 return rc;
djm@1749 1562 }
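/*
 * Illustrative sketch (not part of the original source): how a guest builds
 * requests for do_mmu_update() above. The sub-command travels in the low
 * bits of 'ptr' (the bits below l1_pgentry_t alignment); the remainder is
 * the machine address (or, for MMU_MACHPHYS_UPDATE, encodes the machine
 * frame) being operated on. 'HYPERVISOR_mmu_update', 'pte_maddr',
 * 'new_pte_val', 'mfn' and 'pphys_pfn' are assumed guest-side names, not
 * defined in this file.
 */
#if 0
mmu_update_t req[2];

/* Normal PT update: *pte_maddr = new_pte_val, subject to the type checks
 * carried out above. pte_maddr must be l1_pgentry_t-aligned. */
req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
req[0].val = new_pte_val;

/* Record the pseudo-physical frame backing machine frame 'mfn'. */
req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
req[1].val = pphys_pfn;

(void)HYPERVISOR_mmu_update(req, 2, NULL);
#endif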
djm@1749 1563
djm@1749 1564
djm@1749 1565 int do_update_va_mapping(unsigned long page_nr,
djm@1749 1566 unsigned long val,
djm@1749 1567 unsigned long flags)
djm@1749 1568 {
cl349@2957 1569 struct exec_domain *ed = current;
cl349@2957 1570 struct domain *d = ed->domain;
djm@1749 1571 int err = 0;
cl349@2957 1572 unsigned int cpu = ed->processor;
djm@1749 1573 unsigned long deferred_ops;
djm@1749 1574
djm@1749 1575 perfc_incrc(calls_to_update_va);
djm@1749 1576
djm@1749 1577 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
djm@1749 1578 return -EINVAL;
djm@1749 1579
cl349@3036 1580 LOCK_BIGLOCK(d);
cl349@3036 1581
kaf24@3517 1582 cleanup_writable_pagetable(d);
cl349@1879 1583
djm@1749 1584 /*
djm@1749 1585 * XXX When we make this support 4MB superpages we should also deal with
djm@1749 1586 * the case of updating L2 entries.
djm@1749 1587 */
djm@1749 1588
djm@1749 1589 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
djm@1749 1590 mk_l1_pgentry(val))) )
djm@1749 1591 err = -EINVAL;
djm@1749 1592
cl349@2957 1593 if ( unlikely(ed->mm.shadow_mode) )
djm@1749 1594 {
djm@1749 1595 unsigned long sval;
djm@1749 1596
cl349@2957 1597 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
djm@1749 1598
djm@1749 1599 if ( unlikely(__put_user(sval, ((unsigned long *)(
djm@1749 1600 &shadow_linear_pg_table[page_nr])))) )
djm@1749 1601 {
djm@1749 1602 /*
djm@1749 1603 * Since L2s are guaranteed RW, failure indicates the page was not
djm@1749 1604 * shadowed, so ignore.
djm@1749 1605 */
djm@1749 1606 perfc_incrc(shadow_update_va_fail);
djm@1749 1607 }
djm@1749 1608
djm@1749 1609 /*
djm@1749 1610 * If we're in log-dirty mode then we need to note that we've updated
djm@1749 1611 * the PTE in the PT-holding page. We need the machine frame number
djm@1749 1612 * for this.
djm@1749 1613 */
cl349@2957 1614 if ( ed->mm.shadow_mode == SHM_logdirty )
kaf24@2673 1615 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
djm@1749 1616
cl349@2957 1617 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
djm@1749 1618 }
djm@1749 1619
djm@1749 1620 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1621 percpu_info[cpu].deferred_ops = 0;
djm@1749 1622
djm@1749 1623 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
djm@1749 1624 unlikely(flags & UVMF_FLUSH_TLB) )
djm@1749 1625 local_flush_tlb();
djm@1749 1626 else if ( unlikely(flags & UVMF_INVLPG) )
djm@1749 1627 __flush_tlb_one(page_nr << PAGE_SHIFT);
djm@1749 1628
djm@1749 1629 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
djm@1749 1630 (void)map_ldt_shadow_page(0);
djm@1749 1631
cl349@3036 1632 UNLOCK_BIGLOCK(d);
cl349@3036 1633
djm@1749 1634 return err;
djm@1749 1635 }
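/*
 * Illustrative sketch (not part of the original source): guest-side calls
 * into do_update_va_mapping() above. Note that the first argument is a
 * virtual page number (va >> PAGE_SHIFT), not a machine frame, and the
 * flags select the TLB maintenance performed afterwards.
 * 'HYPERVISOR_update_va_mapping', 'va' and 'new_pte_val' are assumed
 * guest-side names, not defined in this file.
 */
#if 0
/* Install a new PTE for 'va' and flush only that TLB entry. */
(void)HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte_val, UVMF_INVLPG);

/* Or request a full TLB flush along with the update. */
(void)HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte_val,
                                   UVMF_FLUSH_TLB);
#endif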
djm@1749 1636
djm@1749 1637 int do_update_va_mapping_otherdomain(unsigned long page_nr,
djm@1749 1638 unsigned long val,
djm@1749 1639 unsigned long flags,
djm@1749 1640 domid_t domid)
djm@1749 1641 {
djm@1749 1642 unsigned int cpu = smp_processor_id();
djm@1749 1643 struct domain *d;
djm@1749 1644 int rc;
djm@1749 1645
cl349@2957 1646 if ( unlikely(!IS_PRIV(current->domain)) )
djm@1749 1647 return -EPERM;
djm@1749 1648
kaf24@2314 1649 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
djm@1749 1650 if ( unlikely(d == NULL) )
djm@1749 1651 {
djm@1749 1652 MEM_LOG("Unknown domain '%u'", domid);
djm@1749 1653 return -ESRCH;
djm@1749 1654 }
djm@1749 1655
djm@1749 1656 rc = do_update_va_mapping(page_nr, val, flags);
djm@1749 1657
djm@1749 1658 put_domain(d);
kaf24@2314 1659 percpu_info[cpu].foreign = NULL;
djm@1749 1660
djm@1749 1661 return rc;
djm@1749 1662 }
cl349@1879 1663
cl349@1879 1664
cl349@1921 1665
kaf24@2382 1666 /*************************
kaf24@2382 1667 * Writable Pagetables
kaf24@2382 1668 */
cl349@2093 1669
kaf24@2663 1670 ptwr_info_t ptwr_info[NR_CPUS];
cl349@1894 1671
kaf24@2097 1672 #ifdef VERBOSE
cl349@2496 1673 int ptwr_debug = 0x0;
kaf24@2654 1674 #define PTWR_PRINTK(_f, _a...) \
kaf24@2654 1675 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
cl349@2652 1676 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
cl349@2093 1677 #else
kaf24@2654 1678 #define PTWR_PRINTK(_f, _a...) ((void)0)
cl349@2093 1679 #endif
cl349@1879 1680
kaf24@2663 1681 /* Flush the given writable p.t. page and write-protect it again. */
cl349@2512 1682 void ptwr_flush(const int which)
cl349@1879 1683 {
kaf24@2663 1684 unsigned long sstat, spte, pte, *ptep, l1va;
kaf24@2663 1685 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
kaf24@3022 1686 l2_pgentry_t *pl2e;
kaf24@2663 1687 int i, cpu = smp_processor_id();
cl349@2957 1688 struct exec_domain *ed = current;
cl349@2957 1689 struct domain *d = ed->domain;
cl349@1879 1690
iap10@2640 1691 l1va = ptwr_info[cpu].ptinfo[which].l1va;
cl349@2644 1692 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
cl349@1913 1693
kaf24@2663 1694 /*
kaf24@2663 1695 * STEP 1. Write-protect the p.t. page so no more updates can occur.
kaf24@2663 1696 */
kaf24@2663 1697
kaf24@2663 1698 if ( unlikely(__get_user(pte, ptep)) )
kaf24@2663 1699 {
cl349@2512 1700 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
kaf24@2707 1701 /*
kaf24@2707 1702 * Really a bug. We could read this PTE during the initial fault,
kaf24@2841 1703 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
kaf24@2707 1704 */
kaf24@2707 1705 BUG();
cl349@2414 1706 }
kaf24@2654 1707 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
kaf24@2654 1708 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1709 pte &= ~_PAGE_RW;
iap10@2640 1710
cl349@2957 1711 if ( unlikely(ed->mm.shadow_mode) )
kaf24@2663 1712 {
kaf24@2663 1713 /* Write-protect the p.t. page in the shadow page table. */
cl349@2957 1714 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
kaf24@2663 1715 __put_user(
kaf24@2663 1716 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
kaf24@2663 1717
kaf24@2663 1718 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
cl349@2957 1719 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
kaf24@2663 1720 if ( sstat & PSH_shadowed )
kaf24@2663 1721 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
iap10@2640 1722 }
iap10@2640 1723
kaf24@2663 1724 /* Write-protect the p.t. page in the guest page table. */
kaf24@2663 1725 if ( unlikely(__put_user(pte, ptep)) )
kaf24@2663 1726 {
cl349@2512 1727 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
kaf24@2707 1728 /*
kaf24@2707 1729 * Really a bug. We could write this PTE during the initial fault,
kaf24@2841 1730 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
kaf24@2707 1731 */
kaf24@2707 1732 BUG();
cl349@2414 1733 }
kaf24@2663 1734
kaf24@2663 1735 /* Ensure that there are no stale writable mappings in any TLB. */
kaf24@2841 1736 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
cl349@3325 1737 #if 1
kaf24@2841 1738 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
cl349@3036 1739 #else
cl349@3036 1740 flush_tlb_all();
cl349@3036 1741 #endif
kaf24@2654 1742 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
kaf24@2654 1743 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1744
kaf24@2663 1745 /*
kaf24@2663 1746 * STEP 2. Validate any modified PTEs.
kaf24@2663 1747 */
kaf24@2663 1748
cl349@2631 1749 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
kaf24@2663 1750 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2663 1751 {
cl349@2631 1752 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
cl349@2631 1753 nl1e = pl1e[i];
kaf24@2663 1754
kaf24@2663 1755 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
cl349@2631 1756 continue;
kaf24@2663 1757
kaf24@2663 1758 /*
kaf24@2663 1759 * Fast path for PTEs that have merely been write-protected
kaf24@2663 1760 * (e.g., during a Unix fork()). A strict reduction in privilege.
kaf24@2663 1761 */
kaf24@2663 1762 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
kaf24@2663 1763 {
kaf24@2663 1764 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
kaf24@2663 1765 {
kaf24@2663 1766 if ( unlikely(sl1e != NULL) )
kaf24@2673 1767 l1pte_propagate_from_guest(
cl349@2957 1768 &ed->mm, &l1_pgentry_val(nl1e),
kaf24@2663 1769 &l1_pgentry_val(sl1e[i]));
kaf24@2663 1770 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
kaf24@2663 1771 }
cl349@2644 1772 continue;
kaf24@2663 1773 }
kaf24@2663 1774
kaf24@2663 1775 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
kaf24@2663 1776 {
cl349@2631 1777 MEM_LOG("ptwr: Could not re-validate l1 page\n");
kaf24@2707 1778 /*
kaf24@2707 1779 * Make the remaining p.t's consistent before crashing, so the
kaf24@2707 1780 * reference counts are correct.
kaf24@2707 1781 */
kaf24@2707 1782 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
kaf24@2707 1783 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
cl349@2708 1784 unmap_domain_mem(pl1e);
cl349@2708 1785 ptwr_info[cpu].ptinfo[which].l1va = 0;
cl349@3036 1786 UNLOCK_BIGLOCK(d);
cl349@2631 1787 domain_crash();
cl349@2631 1788 }
kaf24@2663 1789
kaf24@2663 1790 if ( unlikely(sl1e != NULL) )
kaf24@2673 1791 l1pte_propagate_from_guest(
cl349@2957 1792 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
kaf24@2663 1793
kaf24@2663 1794 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
kaf24@2663 1795 put_page_from_l1e(ol1e, d);
cl349@2631 1796 }
cl349@2631 1797 unmap_domain_mem(pl1e);
cl349@2631 1798
kaf24@2663 1799 /*
kaf24@2663 1800 * STEP 3. Reattach the L1 p.t. page into the current address space.
kaf24@2663 1801 */
kaf24@2663 1802
cl349@2957 1803 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
kaf24@2663 1804 {
kaf24@2663 1805 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
kaf24@3022 1806 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
cl349@2631 1807 }
iap10@2509 1808
kaf24@2663 1809 /*
kaf24@2663 1810 * STEP 4. Final tidy-up.
kaf24@2663 1811 */
iap10@2509 1812
cl349@2512 1813 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1814
kaf24@2663 1815 if ( unlikely(sl1e != NULL) )
kaf24@2663 1816 {
kaf24@2663 1817 unmap_domain_mem(sl1e);
cl349@2957 1818 put_shadow_status(&ed->mm);
kaf24@2663 1819 }
cl349@1879 1820 }
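/*
 * Illustrative sketch (not part of the original source): the per-entry test
 * applied in STEP 2 of ptwr_flush() above. If the only change to an entry is
 * that _PAGE_RW has been cleared, the update is a pure privilege reduction
 * and needs no get_page_from_l1e() revalidation. 'ptwr_rw_only_cleared' is a
 * hypothetical helper name.
 */
#if 0
static inline int ptwr_rw_only_cleared(l1_pgentry_t ol1e, l1_pgentry_t nl1e)
{
    return l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e) | _PAGE_RW);
}
#endif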
cl349@1879 1821
kaf24@2663 1822 /* Write page fault handler: check if guest is trying to modify a PTE. */
cl349@1879 1823 int ptwr_do_page_fault(unsigned long addr)
cl349@1879 1824 {
kaf24@3022 1825 unsigned long pte, pfn, l2e;
cl349@1879 1826 struct pfn_info *page;
kaf24@3022 1827 l2_pgentry_t *pl2e;
kaf24@2663 1828 int which, cpu = smp_processor_id();
kaf24@2663 1829 u32 l2_idx;
iap10@2458 1830
kaf24@2663 1831 /*
kaf24@2663 1832 * Attempt to read the PTE that maps the VA being accessed. By checking for
kaf24@2663 1833 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
kaf24@2663 1834 */
kaf24@2663 1835 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
kaf24@2663 1836 _PAGE_PRESENT) ||
kaf24@2663 1837 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
cl349@3036 1838 {
kaf24@2663 1839 return 0;
cl349@3036 1840 }
iap10@2509 1841
kaf24@2663 1842 pfn = pte >> PAGE_SHIFT;
kaf24@2663 1843 page = &frame_table[pfn];
cl349@1915 1844
kaf24@2663 1845 /* We are looking only for read-only mappings of p.t. pages. */
kaf24@2663 1846 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
kaf24@2663 1847 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
cl349@3036 1848 {
kaf24@2663 1849 return 0;
cl349@3036 1850 }
kaf24@2663 1851
kaf24@2663 1852 /* Get the L2 index at which this L1 p.t. is always mapped. */
kaf24@2663 1853 l2_idx = page->u.inuse.type_info & PGT_va_mask;
kaf24@2663 1854 if ( unlikely(l2_idx >= PGT_va_unknown) )
cl349@3036 1855 {
kaf24@2663 1856 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
cl349@3036 1857 }
kaf24@2663 1858 l2_idx >>= PGT_va_shift;
kaf24@3022 1859
kaf24@3022 1860 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
kaf24@3022 1861 {
kaf24@3022 1862 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
kaf24@3022 1863 domain_crash();
kaf24@3022 1864 }
kaf24@3022 1865
kaf24@2663 1866 /*
kaf24@2663 1867 * Is the L1 p.t. mapped into the current address space? If so we call it
kaf24@2663 1868 * an ACTIVE p.t., otherwise it is INACTIVE.
kaf24@2663 1869 */
kaf24@2663 1870 pl2e = &linear_l2_table[l2_idx];
kaf24@3022 1871 l2e = l2_pgentry_val(*pl2e);
kaf24@3022 1872 which = PTWR_PT_INACTIVE;
kaf24@3022 1873 if ( (l2e >> PAGE_SHIFT) == pfn )
kaf24@3022 1874 {
cl349@3179 1875 /* Check the PRESENT bit to set ACTIVE. */
kaf24@3022 1876 if ( likely(l2e & _PAGE_PRESENT) )
kaf24@3022 1877 which = PTWR_PT_ACTIVE;
cl349@3179 1878 else {
cl349@3179 1879 /*
cl349@3179 1880 * If the PRESENT bit is clear, we may be conflicting with
cl349@3179 1881 * the current ACTIVE p.t. (it may be the same p.t. mapped
cl349@3179 1882 * at another virt addr).
cl349@3179 1883 * The ptwr_flush call below will restore the PRESENT bit.
cl349@3179 1884 */
cl349@3179 1885 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
cl349@3179 1886 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
cl349@3179 1887 which = PTWR_PT_ACTIVE;
cl349@3179 1888 }
kaf24@3022 1889 }
kaf24@2663 1890
kaf24@2663 1891 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
kaf24@2663 1892 "pfn %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1893 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
kaf24@2663 1894
kaf24@2663 1895 /*
kaf24@2663 1896 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
kaf24@2663 1897 * time. If there is already one, we must flush it out.
kaf24@2663 1898 */
kaf24@2663 1899 if ( ptwr_info[cpu].ptinfo[which].l1va )
kaf24@2663 1900 ptwr_flush(which);
iap10@2507 1901
kaf24@2663 1902 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
kaf24@2663 1903 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
kaf24@2663 1904
kaf24@2663 1905 /* For safety, disconnect the L1 p.t. page from current space. */
kaf24@2663 1906 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
kaf24@2663 1907 {
kaf24@3022 1908 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
cl349@3325 1909 #if 1
kaf24@2841 1910 flush_tlb(); /* XXX Multi-CPU guests? */
cl349@3036 1911 #else
cl349@3036 1912 flush_tlb_all();
cl349@3036 1913 #endif
cl349@1879 1914 }
kaf24@2663 1915
kaf24@2663 1916 /* Temporarily map the L1 page, and make a copy of it. */
kaf24@2663 1917 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
kaf24@2663 1918 memcpy(ptwr_info[cpu].ptinfo[which].page,
kaf24@2663 1919 ptwr_info[cpu].ptinfo[which].pl1e,
kaf24@2663 1920 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
kaf24@2663 1921
kaf24@2663 1922 /* Finally, make the p.t. page writable by the guest OS. */
kaf24@2663 1923 pte |= _PAGE_RW;
kaf24@2663 1924 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1925 &linear_pg_table[addr>>PAGE_SHIFT], pte);
kaf24@2663 1926 if ( unlikely(__put_user(pte, (unsigned long *)
kaf24@2663 1927 &linear_pg_table[addr>>PAGE_SHIFT])) )
kaf24@2663 1928 {
kaf24@2663 1929 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
kaf24@2663 1930 &linear_pg_table[addr>>PAGE_SHIFT]);
kaf24@2707 1931 /* Toss the writable pagetable state and crash. */
kaf24@2707 1932 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
kaf24@2707 1933 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1934 domain_crash();
kaf24@2663 1935 }
kaf24@2663 1936
kaf24@3090 1937 return EXCRET_fault_fixed;
cl349@1879 1938 }
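/*
 * Illustrative sketch (not part of the original source): how the fault
 * handler above recovers the (unique) L2 slot of an L1 page-table page from
 * its type_info, via the PGT_va field maintained by the type system.
 * 'l1_page_to_l2_idx' is a hypothetical helper name; ~0U stands in for the
 * domain_crash() taken above when the L1 is mapped in multiple slots.
 */
#if 0
static u32 l1_page_to_l2_idx(struct pfn_info *page)
{
    u32 l2_idx = page->u.inuse.type_info & PGT_va_mask;
    if ( l2_idx >= PGT_va_unknown )
        return ~0U; /* mapped at more than one L2 slot */
    return l2_idx >> PGT_va_shift;
}
#endif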
cl349@1894 1939
kaf24@2504 1940 static __init int ptwr_init(void)
kaf24@2504 1941 {
kaf24@2504 1942 int i;
kaf24@2504 1943
kaf24@2504 1944 for ( i = 0; i < smp_num_cpus; i++ )
kaf24@2504 1945 {
cl349@2512 1946 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
cl349@2512 1947 (void *)alloc_xenheap_page();
cl349@2512 1948 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
cl349@2512 1949 (void *)alloc_xenheap_page();
kaf24@2504 1950 }
kaf24@2504 1951
kaf24@2504 1952 return 0;
kaf24@2504 1953 }
kaf24@2504 1954 __initcall(ptwr_init);
kaf24@2504 1955
kaf24@2663 1956
kaf24@2663 1957
kaf24@2663 1958
kaf24@2663 1959 /************************************************************************/
kaf24@2663 1960 /************************************************************************/
kaf24@2663 1961 /************************************************************************/
kaf24@2663 1962
cl349@2092 1963 #ifndef NDEBUG
kaf24@2663 1964
cl349@1894 1965 void ptwr_status(void)
cl349@1894 1966 {
cl349@2512 1967 unsigned long pte, *ptep, pfn;
cl349@1894 1968 struct pfn_info *page;
cl349@1894 1969 int cpu = smp_processor_id();
cl349@1894 1970
cl349@2512 1971 ptep = (unsigned long *)&linear_pg_table
cl349@2512 1972 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
kaf24@2237 1973
cl349@2512 1974 if ( __get_user(pte, ptep) ) {
cl349@2512 1975 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
cl349@2495 1976 domain_crash();
cl349@1894 1977 }
cl349@1894 1978
cl349@2495 1979 pfn = pte >> PAGE_SHIFT;
cl349@2495 1980 page = &frame_table[pfn];
cl349@2495 1981 printk("need to alloc l1 page %p\n", page);
cl349@2495 1982 /* make pt page writable */
cl349@2495 1983 printk("need to make read-only l1-page at %p is %08lx\n",
cl349@2512 1984 ptep, pte);
cl349@2495 1985
cl349@2512 1986 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
cl349@1894 1987 return;
cl349@1894 1988
cl349@2512 1989 if ( __get_user(pte, (unsigned long *)
cl349@2512 1990 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
cl349@2491 1991 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
cl349@2512 1992 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
cl349@2491 1993 domain_crash();
cl349@2414 1994 }
cl349@1894 1995 pfn = pte >> PAGE_SHIFT;
cl349@1894 1996 page = &frame_table[pfn];
cl349@1894 1997 }
iap10@2479 1998
kaf24@2637 1999 void audit_domain(struct domain *d)
iap10@2479 2000 {
iap10@2595 2001 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
kaf24@2637 2002
kaf24@2637 2003 void adjust (struct pfn_info *page, int dir, int adjtype)
iap10@2479 2004 {
cl349@2491 2005 int count = page->count_info & PGC_count_mask;
iap10@2479 2006
cl349@2491 2007 if ( adjtype )
cl349@2491 2008 {
cl349@2491 2009 int tcount = page->u.inuse.type_info & PGT_count_mask;
cl349@2491 2010
cl349@2491 2011 ttot++;
iap10@2479 2012
cl349@2491 2013 tcount += dir;
iap10@2479 2014
cl349@2491 2015 if ( tcount < 0 )
cl349@2491 2016 {
cl349@2644 2017 /* This will only come out once. */
kaf24@2637 2018 printk("Audit %d: type count went below zero pfn=%x "
kaf24@2637 2019 "taf=%x otaf=%x\n",
kaf24@2748 2020 d->id, page-frame_table,
cl349@2491 2021 page->u.inuse.type_info,
cl349@2491 2022 page->tlbflush_timestamp);
cl349@2491 2023 }
cl349@2491 2024
cl349@2491 2025 page->u.inuse.type_info =
iap10@2573 2026 (page->u.inuse.type_info & ~PGT_count_mask) |
cl349@2644 2027 (tcount & PGT_count_mask);
cl349@2491 2028 }
iap10@2479 2029
cl349@2491 2030 ctot++;
cl349@2491 2031 count += dir;
cl349@2491 2032 if ( count < 0 )
cl349@2491 2033 {
cl349@2644 2034 /* This will only come out once. */
kaf24@2637 2035 printk("Audit %d: general count went below zero pfn=%x "
kaf24@2637 2036 "taf=%x otaf=%x\n",
kaf24@2748 2037 d->id, page-frame_table,
cl349@2491 2038 page->u.inuse.type_info,
cl349@2491 2039 page->tlbflush_timestamp);
cl349@2491 2040 }
cl349@2491 2041
cl349@2491 2042 page->count_info =
iap10@2573 2043 (page->count_info & ~PGC_count_mask) |
cl349@2644 2044 (count & PGC_count_mask);
iap10@2479 2045
iap10@2479 2046 }
iap10@2479 2047
kaf24@2637 2048 void scan_for_pfn(struct domain *d, unsigned long xpfn)
iap10@2479 2049 {
kaf24@2637 2050 unsigned long pfn, *pt;
cl349@2491 2051 struct list_head *list_ent;
kaf24@2637 2052 struct pfn_info *page;
cl349@2491 2053 int i;
iap10@2479 2054
iap10@2479 2055 list_ent = d->page_list.next;
cl349@2491 2056 for ( i = 0; (list_ent != &d->page_list); i++ )
cl349@2491 2057 {
cl349@2491 2058 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2059 page = &frame_table[pfn];
cl349@2491 2060
kaf24@2637 2061 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2062 {
kaf24@2637 2063 case PGT_l1_page_table:
kaf24@2637 2064 case PGT_l2_page_table:
kaf24@2637 2065 pt = map_domain_mem(pfn<<PAGE_SHIFT);
cl349@2491 2066 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2637 2067 if ( (pt[i] & _PAGE_PRESENT) &&
kaf24@2637 2068 ((pt[i] >> PAGE_SHIFT) == xpfn) )
kaf24@2637 2069 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
kaf24@2748 2070 d->id, i, pfn, page->u.inuse.type_info,
kaf24@2637 2071 page->count_info);
cl349@2491 2072 unmap_domain_mem(pt);
cl349@2491 2073 }
iap10@2479 2074
cl349@2491 2075 list_ent = frame_table[pfn].list.next;
cl349@2491 2076 }
iap10@2479 2077
iap10@2479 2078 }
iap10@2479 2079
kaf24@2637 2080 void scan_for_pfn_remote(unsigned long xpfn)
iap10@2479 2081 {
cl349@2491 2082 struct domain *e;
cl349@2491 2083 for_each_domain ( e )
cl349@2491 2084 scan_for_pfn( e, xpfn );
iap10@2479 2085 }
iap10@2479 2086
iap10@2479 2087 int i;
iap10@2479 2088 unsigned long pfn;
iap10@2479 2089 struct list_head *list_ent;
kaf24@2637 2090 struct pfn_info *page;
iap10@2479 2091
cl349@3036 2092 if ( d != current->domain )
cl349@2491 2093 domain_pause(d);
iap10@2479 2094 synchronise_pagetables(~0UL);
iap10@2479 2095
iap10@2479 2096 printk("pt base=%lx sh_info=%x\n",
cl349@3036 2097 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
cl349@2491 2098 virt_to_page(d->shared_info)-frame_table);
cl349@2491 2099
iap10@2479 2100 spin_lock(&d->page_alloc_lock);
iap10@2479 2101
kaf24@2637 2102 /* PHASE 0 */
iap10@2479 2103
iap10@2479 2104 list_ent = d->page_list.next;
iap10@2479 2105 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2106 {
cl349@2491 2107 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2108 page = &frame_table[pfn];
iap10@2479 2109
cl349@2491 2110 if ( page->u.inuse.domain != d )
cl349@2491 2111 BUG();
iap10@2479 2112
cl349@2491 2113 if ( (page->u.inuse.type_info & PGT_count_mask) >
cl349@2491 2114 (page->count_info & PGC_count_mask) )
cl349@2491 2115 printk("taf > caf %x %x pfn=%lx\n",
cl349@2491 2116 page->u.inuse.type_info, page->count_info, pfn );
iap10@2479 2117
kaf24@2637 2118 #if 0 /* SYSV shared memory pages plus writeable files. */
cl349@2491 2119 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
cl349@2491 2120 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2121 {
cl349@2491 2122 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2123 pfn,
cl349@2491 2124 page->u.inuse.type_info,
cl349@2491 2125 page->count_info );
cl349@2491 2126 scan_for_pfn_remote(pfn);
cl349@2491 2127 }
cl349@2092 2128 #endif
cl349@2491 2129 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
cl349@2491 2130 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2131 {
cl349@2491 2132 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2133 pfn,
cl349@2491 2134 page->u.inuse.type_info,
cl349@2491 2135 page->count_info );
cl349@2491 2136 }
iap10@2479 2137
kaf24@2637 2138 /* Use tlbflush_timestamp to store original type_info. */
cl349@2491 2139 page->tlbflush_timestamp = page->u.inuse.type_info;
iap10@2479 2140
cl349@2491 2141 list_ent = frame_table[pfn].list.next;
iap10@2479 2142 }
iap10@2479 2143
iap10@2479 2144
kaf24@2637 2145 /* PHASE 1 */
iap10@2479 2146
cl349@3036 2147 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
iap10@2479 2148
iap10@2479 2149 list_ent = d->page_list.next;
iap10@2479 2150 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2151 {
kaf24@2637 2152 unsigned long *pt;
cl349@2491 2153 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2154 page = &frame_table[pfn];
iap10@2479 2155
cl349@2491 2156 if ( page->u.inuse.domain != d )
cl349@2491 2157 BUG();
iap10@2479 2158
cl349@2491 2159 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2160 {
cl349@2491 2161 case PGT_l2_page_table:
iap10@2479 2162
cl349@2491 2163 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2164 printk("Audit %d: L2 not validated %x\n",
kaf24@2748 2165 d->id, page->u.inuse.type_info);
iap10@2479 2166
cl349@2491 2167 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2168 printk("Audit %d: L2 not pinned %x\n",
kaf24@2748 2169 d->id, page->u.inuse.type_info);
cl349@2491 2170 else
cl349@2491 2171 adjust( page, -1, 1 );
cl349@2491 2172
cl349@2491 2173 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2174
cl349@2491 2175 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2176 {
cl349@2491 2177 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2178 {
cl349@2491 2179 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2180 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2181
cl349@2491 2182 if ( l1page->u.inuse.domain != d )
cl349@2491 2183 {
kaf24@2637 2184 printk("L2: Skip bizarre page belonging to other "
kaf24@2637 2185 "dom %p\n", l1page->u.inuse.domain);
cl349@2491 2186 continue;
cl349@2491 2187 }
kaf24@2637 2188
kaf24@2637 2189 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
kaf24@2637 2190 PGT_l2_page_table )
kaf24@2637 2191 printk("Audit %d: [%x] Found %s Linear PT "
kaf24@2748 2192 "t=%x pfn=%lx\n", d->id, i,
kaf24@2637 2193 (l1pfn==pfn) ? "Self" : "Other",
kaf24@2637 2194 l1page->u.inuse.type_info,
kaf24@2637 2195 l1pfn);
kaf24@2637 2196 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
kaf24@2637 2197 PGT_l1_page_table )
kaf24@2637 2198 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
kaf24@2748 2199 d->id, i,
kaf24@2637 2200 l1page->u.inuse.type_info,
kaf24@2637 2201 l1pfn);
iap10@2479 2202
kaf24@2637 2203 adjust(l1page, -1, 1);
cl349@2491 2204 }
cl349@2491 2205 }
iap10@2479 2206
cl349@2491 2207 unmap_domain_mem(pt);
iap10@2479 2208
cl349@2491 2209 break;
iap10@2479 2210
iap10@2479 2211
cl349@2491 2212 case PGT_l1_page_table:
cl349@2491 2213
cl349@2491 2214 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2215 adjust( page, -1, 1 );
iap10@2479 2216
cl349@2491 2217 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2218 printk("Audit %d: L1 not validated %x\n",
kaf24@2748 2219 d->id, page->u.inuse.type_info);
iap10@2479 2220 #if 0
cl349@2491 2221 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2222 printk("Audit %d: L1 not pinned %x\n",
kaf24@2748 2223 d->id, page->u.inuse.type_info);
iap10@2479 2224 #endif
cl349@2491 2225 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2226
cl349@2491 2227 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2228 {
cl349@2491 2229 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2230 {
cl349@2491 2231 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2232 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2233
cl349@2644 2234 if ( l1pfn < 0x100 )
cl349@2644 2235 {
cl349@2644 2236 lowmem_mappings++;
cl349@2644 2237 continue;
cl349@2644 2238 }
iap10@2595 2239
cl349@2644 2240 if ( l1pfn > max_page )
cl349@2644 2241 {
cl349@2644 2242 io_mappings++;
cl349@2644 2243 continue;
cl349@2644 2244 }
iap10@2595 2245
cl349@2491 2246 if ( pt[i] & _PAGE_RW )
cl349@2491 2247 {
iap10@2479 2248
cl349@2491 2249 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2250 PGT_l1_page_table ||
cl349@2491 2251 (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2252 PGT_l2_page_table )
cl349@2491 2253 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
kaf24@2748 2254 d->id, i,
cl349@2491 2255 l1page->u.inuse.type_info,
cl349@2491 2256 l1pfn);
iap10@2479 2257
cl349@2491 2258 }
iap10@2479 2259
cl349@2491 2260 if ( l1page->u.inuse.domain != d )
cl349@2491 2261 {
kaf24@2637 2262 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
kaf24@2637 2263 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
kaf24@2748 2264 d->id, pfn, i,
iap10@2595 2265 (unsigned long)l1page->u.inuse.domain,
cl349@2644 2266 l1pfn,
cl349@2644 2267 l1page->count_info,
cl349@2644 2268 l1page->u.inuse.type_info,
cl349@2644 2269 machine_to_phys_mapping[l1pfn]);
cl349@2491 2270 continue;
cl349@2491 2271 }
iap10@2479 2272
kaf24@2637 2273 adjust(l1page, -1, 0);
cl349@2491 2274 }
cl349@2491 2275 }
iap10@2479 2276
cl349@2491 2277 unmap_domain_mem(pt);
iap10@2479 2278
cl349@2491 2279 break;
iap10@2595 2280 }
iap10@2479 2281
cl349@2491 2282 list_ent = frame_table[pfn].list.next;
iap10@2479 2283 }
iap10@2479 2284
kaf24@2637 2285 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
cl349@2644 2286 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
kaf24@2748 2287 d->id, lowmem_mappings, io_mappings);
iap10@2595 2288
kaf24@2637 2289 /* PHASE 2 */
iap10@2479 2290
iap10@2479 2291 ctot = ttot = 0;
iap10@2479 2292 list_ent = d->page_list.next;
iap10@2479 2293 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2294 {
cl349@2491 2295 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2296 page = &frame_table[pfn];
iap10@2479 2297
cl349@2491 2298 switch ( page->u.inuse.type_info & PGT_type_mask)
cl349@2491 2299 {
cl349@2491 2300 case PGT_l1_page_table:
cl349@2491 2301 case PGT_l2_page_table:
cl349@2491 2302 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
cl349@2491 2303 {
cl349@2491 2304 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
kaf24@2748 2305 d->id, page->u.inuse.type_info,
cl349@2491 2306 page->tlbflush_timestamp,
cl349@2491 2307 page->count_info, pfn );
cl349@2491 2308 scan_for_pfn_remote(pfn);
cl349@2491 2309 }
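/* Falls through: the general count check below also applies to
 * page-table pages. */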
cl349@2491 2310 default:
cl349@2491 2311 if ( (page->count_info & PGC_count_mask) != 1 )
cl349@2491 2312 {
kaf24@2637 2313 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
kaf24@2748 2314 d->id,
cl349@2491 2315 page->count_info,
cl349@2491 2316 page->u.inuse.type_info,
cl349@2491 2317 page->tlbflush_timestamp, pfn );
cl349@2491 2318 scan_for_pfn_remote(pfn);
cl349@2491 2319 }
cl349@2491 2320 break;
cl349@2491 2321 }
iap10@2479 2322
cl349@2491 2323 list_ent = frame_table[pfn].list.next;
iap10@2479 2324 }
iap10@2479 2325
kaf24@2637 2326 /* PHASE 3 */
iap10@2479 2327
iap10@2479 2328 list_ent = d->page_list.next;
iap10@2479 2329 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2330 {
kaf24@2637 2331 unsigned long *pt;
cl349@2491 2332 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2333 page = &frame_table[pfn];
iap10@2479 2334
cl349@2491 2335 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2336 {
cl349@2491 2337 case PGT_l2_page_table:
cl349@2491 2338 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2339 adjust( page, 1, 1 );
iap10@2479 2340
cl349@2491 2341 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2342
cl349@2491 2343 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2344 {
cl349@2491 2345 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2346 {
cl349@2491 2347 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2348 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2349
cl349@2491 2350 if ( l1page->u.inuse.domain == d)
kaf24@2637 2351 adjust(l1page, 1, 1);
cl349@2491 2352 }
cl349@2491 2353 }
iap10@2479 2354
cl349@2491 2355 unmap_domain_mem(pt);
cl349@2491 2356 break;
iap10@2479 2357
cl349@2491 2358 case PGT_l1_page_table:
cl349@2491 2359 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2360 adjust( page, 1, 1 );
iap10@2479 2361
cl349@2491 2362 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2363
cl349@2491 2364 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2365 {
cl349@2491 2366 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2367 {
cl349@2491 2368 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2369 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2370
kaf24@2637 2371 if ( (l1page->u.inuse.domain != d) ||
kaf24@2637 2372 (l1pfn < 0x100) || (l1pfn > max_page) )
kaf24@2637 2373 continue;
iap10@2595 2374
cl349@2644 2375 adjust(l1page, 1, 0);
cl349@2491 2376 }
cl349@2491 2377 }
iap10@2479 2378
cl349@2491 2379 unmap_domain_mem(pt);
cl349@2491 2380 break;
cl349@2491 2381 }
iap10@2479 2382
iap10@2479 2383
kaf24@2637 2384 page->tlbflush_timestamp = 0;
iap10@2479 2385
cl349@2491 2386 list_ent = frame_table[pfn].list.next;
iap10@2479 2387 }
iap10@2479 2388
iap10@2479 2389 spin_unlock(&d->page_alloc_lock);
iap10@2479 2390
cl349@3036 2391 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
iap10@2479 2392
kaf24@2748 2393 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
iap10@2479 2394
cl349@3036 2395 if ( d != current->domain )
cl349@2491 2396 domain_unpause(d);
iap10@2479 2397 }
iap10@2479 2398
cl349@2491 2399 void audit_domains(void)
iap10@2479 2400 {
iap10@2479 2401 struct domain *d;
iap10@2479 2402 for_each_domain ( d )
cl349@2644 2403 audit_domain(d);
iap10@2479 2404 }
iap10@2479 2405
kaf24@2842 2406 void audit_domains_key(unsigned char key)
iap10@2479 2407 {
kaf24@2842 2408 audit_domains();
iap10@2479 2409 }
iap10@2479 2410
iap10@2479 2411 #endif