debuggers.hg

annotate xen/arch/x86/memory.c @ 3632:fec8b1778268

bitkeeper revision 1.1159.212.60 (41febc4bKKSkh9u-Zes9v2CmBuLZxA)

More bootstrap fixes for x86/64. Next thing to do is sort out the IDT and
get traps.c working; then we can get rid of a bunch of dummy labels from
end of boot/x86_64.S. We're also going to need some kind of entry.S before
we can safely enable interrupts. Also bear in mind that not all of physical
RAM may be mapped (only first 1GB) and no m2p table is yet allocated or
mapped. Plenty to be done!
author kaf24@viper.(none)
date Mon Jan 31 23:16:27 2005 +0000 (2005-01-31)
parents 6d98eb831816
children 9a9c5a491401 e6af5d8f8b39 fd1dd0663b09
rev   line source
djm@1749 1 /******************************************************************************
djm@1749 2 * arch/x86/memory.c
djm@1749 3 *
djm@1749 4 * Copyright (c) 2002-2004 K A Fraser
cl349@2093 5 * Copyright (c) 2004 Christian Limpach
djm@1749 6 *
djm@1749 7 * This program is free software; you can redistribute it and/or modify
djm@1749 8 * it under the terms of the GNU General Public License as published by
djm@1749 9 * the Free Software Foundation; either version 2 of the License, or
djm@1749 10 * (at your option) any later version.
djm@1749 11 *
djm@1749 12 * This program is distributed in the hope that it will be useful,
djm@1749 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
djm@1749 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
djm@1749 15 * GNU General Public License for more details.
djm@1749 16 *
djm@1749 17 * You should have received a copy of the GNU General Public License
djm@1749 18 * along with this program; if not, write to the Free Software
djm@1749 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
djm@1749 20 */
djm@1749 21
djm@1749 22 /*
djm@1749 23 * A description of the x86 page table API:
djm@1749 24 *
djm@1749 25 * Domains trap to do_mmu_update with a list of update requests.
djm@1749 26 * This is a list of (ptr, val) pairs, where the requested operation
djm@1749 27 * is *ptr = val.
djm@1749 28 *
djm@1749 29 * Reference counting of pages:
djm@1749 30 * ----------------------------
djm@1749 31 * Each page has two refcounts: tot_count and type_count.
djm@1749 32 *
djm@1749 33 * TOT_COUNT is the obvious reference count. It counts all uses of a
djm@1749 34 * physical page frame by a domain, including uses as a page directory,
djm@1749 35 * a page table, or simple mappings via a PTE. This count prevents a
djm@1749 36 * domain from releasing a frame back to the free pool when it still holds
djm@1749 37 * a reference to it.
djm@1749 38 *
djm@1749 39 * TYPE_COUNT is more subtle. A frame can be put to one of three
djm@1749 40 * mutually-exclusive uses: it might be used as a page directory, or a
kaf24@2375 41 * page table, or it may be mapped writable by the domain [of course, a
djm@1749 42 * frame might be used in none of these three ways!].
djm@1749 43 * So, type_count is a count of the number of times a frame is being
djm@1749 44 * referred to in its current incarnation. Therefore, a page can only
djm@1749 45 * change its type when its type count is zero.
djm@1749 46 *
djm@1749 47 * Pinning the page type:
djm@1749 48 * ----------------------
djm@1749 49 * The type of a page can be pinned/unpinned with the commands
djm@1749 50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
djm@1749 51 * pinning is not reference counted, so it can't be nested).
djm@1749 52 * This is useful to prevent a page's type count falling to zero, at which
djm@1749 53 * point safety checks would need to be carried out next time the count
djm@1749 54 * is increased again.
djm@1749 55 *
kaf24@2375 56 * A further note on writable page mappings:
kaf24@2375 57 * -----------------------------------------
kaf24@2375 58 * For simplicity, the count of writable mappings for a page may not
kaf24@2375 59 * correspond to reality. The 'writable count' is incremented for every
djm@1749 60 * PTE which maps the page with the _PAGE_RW flag set. However, for
djm@1749 61 * write access to be possible the page directory entry must also have
djm@1749 62 * its _PAGE_RW bit set. We do not check this as it complicates the
djm@1749 63 * reference counting considerably [consider the case of multiple
djm@1749 64 * directory entries referencing a single page table, some with the RW
djm@1749 65 * bit set, others not -- it starts getting a bit messy].
djm@1749 66 * In normal use, this simplification shouldn't be a problem.
djm@1749 67 * However, the logic can be added if required.
djm@1749 68 *
djm@1749 69 * One more note on read-only page mappings:
djm@1749 70 * -----------------------------------------
djm@1749 71 * We want domains to be able to map pages for read-only access. The
djm@1749 72 * main reason is that page tables and directories should be readable
kaf24@2375 73 * by a domain, but it would not be safe for them to be writable.
djm@1749 74 * However, domains have free access to rings 1 & 2 of the Intel
djm@1749 75 * privilege model. In terms of page protection, these are considered
djm@1749 76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
djm@1749 77 * read-only restrictions are respected in supervisor mode -- if the
kaf24@2375 78 * bit is clear then any mapped page is writable.
djm@1749 79 *
djm@1749 80 * We get round this by always setting the WP bit and disallowing
djm@1749 81 * updates to it. This is very unlikely to cause a problem for guest
djm@1749 82 * OS's, which will generally use the WP bit to simplify copy-on-write
djm@1749 83 * implementation (in that case, the OS wants a fault when it writes to
djm@1749 84 * an application-supplied buffer).
djm@1749 85 */
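The interface described above is easiest to see from the guest's side. Below is a minimal, hypothetical sketch (not part of this file) of batching two page-table writes; it assumes a guest-side HYPERVISOR_mmu_update(reqs, count, done) wrapper taking the same three arguments as the do_mmu_update() handler later in this file, plus a made-up pte_machine_addr() helper that returns the machine address of the PTE mapping a given virtual address.

    /* Each request is a (ptr, val) pair meaning "*ptr = val"; the command
     * selector travels in the low bits of ptr. */
    mmu_update_t req[2];
    unsigned int done = 0;

    req[0].ptr = pte_machine_addr(va1) | MMU_NORMAL_PT_UPDATE;  /* remap va1 */
    req[0].val = (new_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

    req[1].ptr = pte_machine_addr(va2) | MMU_NORMAL_PT_UPDATE;  /* unmap va2 */
    req[1].val = 0;

    if ( HYPERVISOR_mmu_update(req, 2, &done) != 0 )
        /* 'done' reports how many requests were applied before the failure. */
        printk("only %u of 2 updates were applied\n", done);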
djm@1749 86
djm@1749 87 #include <xen/config.h>
djm@1749 88 #include <xen/init.h>
kaf24@3392 89 #include <xen/kernel.h>
djm@1749 90 #include <xen/lib.h>
djm@1749 91 #include <xen/mm.h>
djm@1749 92 #include <xen/sched.h>
djm@1749 93 #include <xen/errno.h>
djm@1749 94 #include <xen/perfc.h>
djm@1749 95 #include <xen/irq.h>
iap10@2479 96 #include <xen/softirq.h>
kaf24@1787 97 #include <asm/shadow.h>
djm@1749 98 #include <asm/page.h>
djm@1749 99 #include <asm/flushtlb.h>
djm@1749 100 #include <asm/io.h>
djm@1749 101 #include <asm/uaccess.h>
djm@1749 102 #include <asm/domain_page.h>
djm@1749 103 #include <asm/ldt.h>
djm@1749 104
kaf24@2097 105 #ifdef VERBOSE
djm@1749 106 #define MEM_LOG(_f, _a...) \
djm@1749 107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
cl349@2957 108 current->domain->id , __LINE__ , ## _a )
djm@1749 109 #else
djm@1749 110 #define MEM_LOG(_f, _a...) ((void)0)
djm@1749 111 #endif
djm@1749 112
djm@1749 113 static int alloc_l2_table(struct pfn_info *page);
djm@1749 114 static int alloc_l1_table(struct pfn_info *page);
djm@1749 115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
djm@1749 116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 117 u32 type,
djm@1749 118 struct domain *d);
djm@1749 119
djm@1749 120 static void free_l2_table(struct pfn_info *page);
djm@1749 121 static void free_l1_table(struct pfn_info *page);
djm@1749 122
djm@1749 123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
djm@1749 124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
djm@1749 125
djm@1749 126 /* Used to defer flushing of memory structures. */
djm@1749 127 static struct {
djm@1749 128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
djm@1749 129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
kaf24@3187 130 unsigned long deferred_ops;
kaf24@2314 131 /* If non-NULL, specifies a foreign subject domain for some operations. */
kaf24@3187 132 struct domain *foreign;
kaf24@3113 133 } __cacheline_aligned percpu_info[NR_CPUS];
djm@1749 134
kaf24@2314 135 /*
kaf24@2314 136 * Returns the current foreign domain; defaults to the currently-executing
kaf24@2314 137 * domain if a foreign override hasn't been specified.
kaf24@2314 138 */
cl349@2957 139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
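/* Editorial note: the GNU "a ? : b" form above evaluates to the foreign
 * domain pointer when it is non-NULL and to current->domain otherwise. */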
djm@1749 140
kaf24@2336 141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
kaf24@2336 142 static struct domain *dom_xen, *dom_io;
cl349@2227 143
kaf24@3392 144 /* Frame table and its size in pages. */
kaf24@3392 145 struct pfn_info *frame_table;
kaf24@3392 146 unsigned long frame_table_size;
kaf24@3392 147 unsigned long max_page;
kaf24@3392 148
kaf24@3392 149 void __init init_frametable(void)
kaf24@3392 150 {
kaf24@3392 151 unsigned long i, p;
kaf24@3392 152
kaf24@3632 153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
kaf24@3392 154 frame_table_size = max_page * sizeof(struct pfn_info);
kaf24@3392 155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
kaf24@3392 156
kaf24@3392 157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
kaf24@3392 158 {
kaf24@3392 159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
kaf24@3392 160 if ( p == 0 )
kaf24@3392 161 panic("Not enough memory for frame table\n");
kaf24@3632 162 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
kaf24@3632 163 4UL << 20, PAGE_HYPERVISOR);
kaf24@3392 164 }
kaf24@3392 165
kaf24@3392 166 memset(frame_table, 0, frame_table_size);
kaf24@3392 167 }
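/* Worked example (editorial): with 1GB of RAM and 4KB pages, max_page is
 * 262144, so frame_table_size = 262144 * sizeof(struct pfn_info) =
 * 262144 * 24 = 6MB, already a whole number of pages.  The loop above then
 * runs twice (i = 0 and i = 4MB), taking a 4MB and then a 2MB boot
 * allocation, each 4MB-aligned, and mapping a full 4MB window at
 * FRAMETABLE_VIRT_START + i before the table is zeroed. */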
kaf24@3392 168
cl349@2227 169 void arch_init_memory(void)
djm@1749 170 {
sos22@3478 171 unsigned long i;
kaf24@2336 172
kaf24@2384 173 /*
kaf24@2384 174 * We are rather picky about the layout of 'struct pfn_info'. The
kaf24@2384 175 * count_info and domain fields must be adjacent, as we perform atomic
kaf24@2384 176 * 64-bit operations on them. Also, just for sanity, we assert the size
kaf24@2384 177 * of the structure here.
kaf24@2384 178 */
kaf24@2384 179 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
kaf24@2384 180 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
kaf24@2384 181 (sizeof(struct pfn_info) != 24) )
kaf24@2384 182 {
kaf24@2384 183 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
kaf24@2384 184 offsetof(struct pfn_info, count_info),
kaf24@2384 185 offsetof(struct pfn_info, u.inuse.domain),
kaf24@2384 186 sizeof(struct pfn_info));
kaf24@2384 187 for ( ; ; ) ;
kaf24@2384 188 }
kaf24@2384 189
djm@1749 190 memset(percpu_info, 0, sizeof(percpu_info));
cl349@2227 191
kaf24@2336 192 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
kaf24@2336 193 memset(machine_to_phys_mapping, 0x55, 4<<20);
kaf24@2336 194
kaf24@2336 195 /*
kaf24@2336 196 * Initialise our DOMID_XEN domain.
kaf24@2336 197 * Any Xen-heap pages that we will allow to be mapped will have
kaf24@2336 198 * their domain field set to dom_xen.
kaf24@2336 199 */
kaf24@2336 200 dom_xen = alloc_domain_struct();
kaf24@2336 201 atomic_set(&dom_xen->refcnt, 1);
kaf24@2748 202 dom_xen->id = DOMID_XEN;
kaf24@2336 203
kaf24@2336 204 /*
kaf24@2336 205 * Initialise our DOMID_IO domain.
kaf24@2336 206 * This domain owns no pages but is considered a special case when
kaf24@2336 207 * mapping I/O pages, as such mappings are made with the privileges of the caller.
kaf24@2336 208 */
kaf24@2336 209 dom_io = alloc_domain_struct();
kaf24@2336 210 atomic_set(&dom_io->refcnt, 1);
kaf24@2748 211 dom_io->id = DOMID_IO;
kaf24@2336 212
kaf24@2336 213 /* M2P table is mappable read-only by privileged domains. */
kaf24@3392 214 for ( i = 0; i < 1024; i++ )
kaf24@2336 215 {
sos22@3478 216 frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
sos22@3478 217 /* Use the GDT page type to make sure it can only be mapped
sos22@3478 218 read-only by non-privileged domains. */
sos22@3478 219 frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
sos22@3478 220 frame_table[m2p_start_mfn+i].u.inuse.domain = dom_xen;
kaf24@2336 221 }
djm@1749 222 }
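/* Editorial sketch of the struct pfn_info layout that the check in
 * arch_init_memory() above enforces (offsets inferred from the
 * offsetof/sizeof assertions, not restated from a header):
 *
 *     offset 0 : u32            count_info;      (low word of cmpxchg8b)
 *     offset 4 : struct domain *u.inuse.domain;  (high word of cmpxchg8b)
 *     ...                                        (total size 24 bytes)
 *
 * Keeping these two fields adjacent is what allows MMUEXT_TRANSFER_PAGE and
 * MMUEXT_REASSIGN_PAGE, later in this file, to swap the reference count and
 * the owning domain in a single atomic cmpxchg8b. */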
djm@1749 223
cl349@2957 224 static void __invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 225 {
djm@1749 226 int i;
djm@1749 227 unsigned long pfn;
djm@1749 228 struct pfn_info *page;
djm@1749 229
djm@1749 230 d->mm.shadow_ldt_mapcnt = 0;
djm@1749 231
djm@1749 232 for ( i = 16; i < 32; i++ )
djm@1749 233 {
cl349@3036 234 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
djm@1749 235 if ( pfn == 0 ) continue;
cl349@3036 236 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
djm@1749 237 page = &frame_table[pfn];
djm@1749 238 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
cl349@3036 239 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
djm@1749 240 put_page_and_type(page);
djm@1749 241 }
djm@1749 242
djm@1749 243 /* Dispose of the (now possibly invalid) mappings from the TLB. */
djm@1749 244 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
djm@1749 245 }
djm@1749 246
djm@1749 247
cl349@2957 248 static inline void invalidate_shadow_ldt(struct exec_domain *d)
djm@1749 249 {
djm@1749 250 if ( d->mm.shadow_ldt_mapcnt != 0 )
djm@1749 251 __invalidate_shadow_ldt(d);
djm@1749 252 }
djm@1749 253
djm@1749 254
kaf24@2336 255 static int alloc_segdesc_page(struct pfn_info *page)
djm@1749 256 {
djm@1749 257 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
djm@1749 258 int i;
djm@1749 259
djm@1749 260 for ( i = 0; i < 512; i++ )
kaf24@1854 261 if ( unlikely(!check_descriptor(&descs[i*2])) )
djm@1749 262 goto fail;
djm@1749 263
djm@1749 264 unmap_domain_mem(descs);
djm@1749 265 return 1;
djm@1749 266
djm@1749 267 fail:
djm@1749 268 unmap_domain_mem(descs);
djm@1749 269 return 0;
djm@1749 270 }
djm@1749 271
djm@1749 272
djm@1749 273 /* Map shadow page at offset @off. */
djm@1749 274 int map_ldt_shadow_page(unsigned int off)
djm@1749 275 {
cl349@2957 276 struct exec_domain *ed = current;
cl349@2957 277 struct domain *d = ed->domain;
djm@1749 278 unsigned long l1e;
djm@1749 279
djm@1749 280 if ( unlikely(in_irq()) )
djm@1749 281 BUG();
djm@1749 282
cl349@2957 283 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
djm@1749 284 PAGE_SHIFT) + off]);
djm@1749 285
djm@1749 286 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
djm@1749 287 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
djm@1749 288 d, PGT_ldt_page)) )
djm@1749 289 return 0;
djm@1749 290
cl349@3036 291 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
cl349@2957 292 ed->mm.shadow_ldt_mapcnt++;
djm@1749 293
djm@1749 294 return 1;
djm@1749 295 }
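/* Editorial note: the shadow LDT occupies per-domain PTE slots 16..31,
 * hence the 'off + 16' above and the 16..31 loop in
 * __invalidate_shadow_ldt().  Sixteen 4KB slots give 64KB, which matches
 * the 8192-entry limit enforced by MMUEXT_SET_LDT below: 8192 descriptors
 * of LDT_ENTRY_SIZE (8) bytes each is exactly 16 pages. */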
djm@1749 296
djm@1749 297
djm@1749 298 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
djm@1749 299 {
djm@1749 300 struct pfn_info *page = &frame_table[page_nr];
djm@1749 301
djm@1749 302 if ( unlikely(!pfn_is_ram(page_nr)) )
djm@1749 303 {
djm@1749 304 MEM_LOG("Pfn %08lx is not RAM", page_nr);
djm@1749 305 return 0;
djm@1749 306 }
djm@1749 307
djm@1749 308 if ( unlikely(!get_page(page, d)) )
djm@1749 309 {
djm@1749 310 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
djm@1749 311 return 0;
djm@1749 312 }
djm@1749 313
djm@1749 314 return 1;
djm@1749 315 }
djm@1749 316
djm@1749 317
djm@1749 318 static int get_page_and_type_from_pagenr(unsigned long page_nr,
djm@1749 319 u32 type,
djm@1749 320 struct domain *d)
djm@1749 321 {
djm@1749 322 struct pfn_info *page = &frame_table[page_nr];
djm@1749 323
djm@1749 324 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
djm@1749 325 return 0;
djm@1749 326
djm@1749 327 if ( unlikely(!get_page_type(page, type)) )
djm@1749 328 {
cl349@2450 329 #ifdef VERBOSE
cl349@2491 330 if ( (type & PGT_type_mask) != PGT_l1_page_table )
cl349@2491 331 MEM_LOG("Bad page type for pfn %08lx (%08x)",
cl349@2491 332 page_nr, page->u.inuse.type_info);
cl349@2450 333 #endif
djm@1749 334 put_page(page);
djm@1749 335 return 0;
djm@1749 336 }
djm@1749 337
djm@1749 338 return 1;
djm@1749 339 }
djm@1749 340
djm@1749 341
djm@1749 342 /*
djm@1749 343 * We allow L2 tables to map each other (a.k.a. linear page tables). This
djm@1749 344 * needs some special care with reference counts and access permissions:
djm@1749 345 * 1. The mapping entry must be read-only, or the guest may get write access
djm@1749 346 * to its own PTEs.
djm@1749 347 * 2. We must only bump the reference counts for an *already validated*
djm@1749 348 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
djm@1749 349 * on a validation that is required to complete that validation.
djm@1749 350 * 3. We only need to increment the reference counts for the mapped page
djm@1749 351 * frame if it is mapped by a different L2 table. This is sufficient and
djm@1749 352 * also necessary to allow validation of an L2 table mapping itself.
djm@1749 353 */
kaf24@2314 354 static int
kaf24@2314 355 get_linear_pagetable(
kaf24@2314 356 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
djm@1749 357 {
djm@1749 358 u32 x, y;
djm@1749 359 struct pfn_info *page;
djm@1749 360
djm@1749 361 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
djm@1749 362 {
djm@1749 363 MEM_LOG("Attempt to create linear p.t. with write perms");
djm@1749 364 return 0;
djm@1749 365 }
djm@1749 366
djm@1749 367 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
djm@1749 368 {
djm@1749 369 /* Make sure the mapped frame belongs to the correct domain. */
kaf24@2314 370 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
djm@1749 371 return 0;
djm@1749 372
djm@1749 373 /*
djm@1749 374 * Make sure that the mapped frame is an already-validated L2 table.
djm@1749 375 * If so, atomically increment the count (checking for overflow).
djm@1749 376 */
djm@1749 377 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
kaf24@1970 378 y = page->u.inuse.type_info;
djm@1749 379 do {
djm@1749 380 x = y;
djm@1749 381 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
djm@1749 382 unlikely((x & (PGT_type_mask|PGT_validated)) !=
djm@1749 383 (PGT_l2_page_table|PGT_validated)) )
djm@1749 384 {
djm@1749 385 put_page(page);
djm@1749 386 return 0;
djm@1749 387 }
djm@1749 388 }
kaf24@1970 389 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
djm@1749 390 }
djm@1749 391
djm@1749 392 return 1;
djm@1749 393 }
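A guest installs such a linear (self-referencing) mapping simply by writing an L2 slot that points back at the L2 table itself. A hedged guest-side sketch follows, reusing the hypothetical HYPERVISOR_mmu_update() wrapper from the sketch near the top of this file; l2_slot_machine_addr() and LINEAR_SLOT are likewise made-up names for the machine address of the chosen L2 slot.

    mmu_update_t req;

    /* Rule 1 above: the self-mapping must be read-only, so no _PAGE_RW. */
    req.ptr = l2_slot_machine_addr(l2_mfn, LINEAR_SLOT) | MMU_NORMAL_PT_UPDATE;
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;
    (void)HYPERVISOR_mmu_update(&req, 1, NULL);

Per rule 3 above, mapping a table into itself takes no extra reference, which is what allows an L2 table containing this self-entry to be validated at all.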
djm@1749 394
djm@1749 395
kaf24@2314 396 static int
kaf24@2314 397 get_page_from_l1e(
kaf24@2314 398 l1_pgentry_t l1e, struct domain *d)
djm@1749 399 {
djm@1749 400 unsigned long l1v = l1_pgentry_val(l1e);
djm@1749 401 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2382 402 struct pfn_info *page = &frame_table[pfn];
djm@1749 403 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
djm@1749 404
djm@1749 405 if ( !(l1v & _PAGE_PRESENT) )
djm@1749 406 return 1;
djm@1749 407
djm@1749 408 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
djm@1749 409 {
djm@1749 410 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
djm@1749 411 return 0;
djm@1749 412 }
djm@1749 413
djm@1749 414 if ( unlikely(!pfn_is_ram(pfn)) )
djm@1749 415 {
kaf24@2336 416 /* Revert to caller privileges if FD == DOMID_IO. */
kaf24@2336 417 if ( d == dom_io )
cl349@2957 418 d = current->domain;
kaf24@2336 419
kaf24@2336 420 if ( IS_PRIV(d) )
djm@1749 421 return 1;
djm@1749 422
kaf24@2336 423 if ( IS_CAPABLE_PHYSDEV(d) )
kaf24@2336 424 return domain_iomem_in_pfn(d, pfn);
djm@1749 425
djm@1749 426 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
djm@1749 427 return 0;
djm@1749 428 }
djm@1749 429
kaf24@2756 430 return ((l1v & _PAGE_RW) ?
kaf24@2756 431 get_page_and_type(page, d, PGT_writable_page) :
kaf24@2757 432 get_page(page, d));
djm@1749 433 }
djm@1749 434
djm@1749 435
djm@1749 436 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
kaf24@2314 437 static int
kaf24@2314 438 get_page_from_l2e(
kaf24@2466 439 l2_pgentry_t l2e, unsigned long pfn,
kaf24@2466 440 struct domain *d, unsigned long va_idx)
djm@1749 441 {
iap10@2458 442 int rc;
iap10@2458 443
djm@1749 444 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
djm@1749 445 return 1;
djm@1749 446
djm@1749 447 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
djm@1749 448 {
djm@1749 449 MEM_LOG("Bad L2 page type settings %04lx",
djm@1749 450 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
djm@1749 451 return 0;
djm@1749 452 }
djm@1749 453
iap10@2458 454 rc = get_page_and_type_from_pagenr(
iap10@2458 455 l2_pgentry_to_pagenr(l2e),
kaf24@2466 456 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
iap10@2458 457
iap10@2458 458 if ( unlikely(!rc) )
kaf24@2314 459 return get_linear_pagetable(l2e, pfn, d);
djm@1749 460
djm@1749 461 return 1;
djm@1749 462 }
djm@1749 463
djm@1749 464
kaf24@2382 465 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
djm@1749 466 {
djm@1749 467 unsigned long l1v = l1_pgentry_val(l1e);
kaf24@2385 468 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
kaf24@2385 469 struct pfn_info *page = &frame_table[pfn];
iap10@3424 470 struct domain *e;
djm@1749 471
kaf24@2385 472 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
djm@1749 473 return;
djm@1749 474
iap10@3424 475 e = page->u.inuse.domain;
kaf24@2382 476 if ( unlikely(e != d) )
kaf24@2382 477 {
kaf24@2382 478 /*
kaf24@2382 479 * Unmap a foreign page that may have been mapped via a grant table.
kaf24@2382 480 * Note that this can fail for a privileged domain that can map foreign
kaf24@2382 481 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
kaf24@2382 482 * counted via a grant entry and some counted directly in the page
kaf24@2382 483 * structure's reference count. Note that reference counts won't get
kaf24@2382 484 * dangerously confused as long as we always try to decrement the
kaf24@2382 485 * grant entry first. We may end up with a mismatch between which
kaf24@2382 486 * mappings and which unmappings are counted via the grant entry, but
kaf24@2382 487 * really it doesn't matter as privileged domains have carte blanche.
kaf24@2382 488 */
kaf24@2655 489 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
kaf24@2382 490 return;
kaf24@2382 491 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
kaf24@2382 492 }
kaf24@2382 493
djm@1749 494 if ( l1v & _PAGE_RW )
djm@1749 495 {
djm@1749 496 put_page_and_type(page);
djm@1749 497 }
djm@1749 498 else
djm@1749 499 {
djm@1749 500 /* We expect this is rare so we blow the entire shadow LDT. */
kaf24@1970 501 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
djm@1749 502 PGT_ldt_page)) &&
kaf24@1970 503 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
cl349@2957 504 invalidate_shadow_ldt(e->exec_domain[0]);
djm@1749 505 put_page(page);
djm@1749 506 }
djm@1749 507 }
djm@1749 508
djm@1749 509
djm@1749 510 /*
djm@1749 511 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
djm@1749 512 * Note also that this automatically deals correctly with linear p.t.'s.
djm@1749 513 */
djm@1749 514 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
djm@1749 515 {
djm@1749 516 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
djm@1749 517 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
djm@1749 518 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
djm@1749 519 }
djm@1749 520
djm@1749 521
djm@1749 522 static int alloc_l2_table(struct pfn_info *page)
djm@1749 523 {
kaf24@2314 524 struct domain *d = page->u.inuse.domain;
kaf24@2314 525 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 526 l2_pgentry_t *pl2e;
kaf24@2314 527 int i;
djm@1749 528
djm@1749 529 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 530
kaf24@3392 531 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
iap10@2458 532 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
djm@1749 533 goto fail;
kaf24@3392 534
djm@1749 535 #if defined(__i386__)
djm@1749 536 /* Now we add our private high mappings. */
djm@1749 537 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 538 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
djm@1749 539 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
djm@1749 540 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
djm@1749 541 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
djm@1749 542 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
cl349@3036 543 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
djm@1749 544 __PAGE_HYPERVISOR);
djm@1749 545 #endif
djm@1749 546
djm@1749 547 unmap_domain_mem(pl2e);
djm@1749 548 return 1;
djm@1749 549
djm@1749 550 fail:
djm@1749 551 while ( i-- > 0 )
djm@1749 552 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 553
djm@1749 554 unmap_domain_mem(pl2e);
djm@1749 555 return 0;
djm@1749 556 }
djm@1749 557
djm@1749 558
djm@1749 559 static int alloc_l1_table(struct pfn_info *page)
djm@1749 560 {
kaf24@2314 561 struct domain *d = page->u.inuse.domain;
kaf24@2314 562 unsigned long page_nr = page_to_pfn(page);
kaf24@2314 563 l1_pgentry_t *pl1e;
kaf24@2314 564 int i;
djm@1749 565
djm@1749 566 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 567
djm@1749 568 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2314 569 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
djm@1749 570 goto fail;
djm@1749 571
djm@1749 572 unmap_domain_mem(pl1e);
djm@1749 573 return 1;
djm@1749 574
djm@1749 575 fail:
djm@1749 576 while ( i-- > 0 )
kaf24@2382 577 put_page_from_l1e(pl1e[i], d);
djm@1749 578
djm@1749 579 unmap_domain_mem(pl1e);
djm@1749 580 return 0;
djm@1749 581 }
djm@1749 582
djm@1749 583
djm@1749 584 static void free_l2_table(struct pfn_info *page)
djm@1749 585 {
djm@1749 586 unsigned long page_nr = page - frame_table;
djm@1749 587 l2_pgentry_t *pl2e;
djm@1749 588 int i;
djm@1749 589
djm@1749 590 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 591
djm@1749 592 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
djm@1749 593 put_page_from_l2e(pl2e[i], page_nr);
djm@1749 594
djm@1749 595 unmap_domain_mem(pl2e);
djm@1749 596 }
djm@1749 597
djm@1749 598
djm@1749 599 static void free_l1_table(struct pfn_info *page)
djm@1749 600 {
kaf24@2382 601 struct domain *d = page->u.inuse.domain;
djm@1749 602 unsigned long page_nr = page - frame_table;
djm@1749 603 l1_pgentry_t *pl1e;
djm@1749 604 int i;
djm@1749 605
djm@1749 606 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
djm@1749 607
djm@1749 608 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2382 609 put_page_from_l1e(pl1e[i], d);
djm@1749 610
djm@1749 611 unmap_domain_mem(pl1e);
djm@1749 612 }
djm@1749 613
djm@1749 614
djm@1749 615 static inline int update_l2e(l2_pgentry_t *pl2e,
djm@1749 616 l2_pgentry_t ol2e,
djm@1749 617 l2_pgentry_t nl2e)
djm@1749 618 {
djm@1749 619 unsigned long o = cmpxchg((unsigned long *)pl2e,
djm@1749 620 l2_pgentry_val(ol2e),
djm@1749 621 l2_pgentry_val(nl2e));
djm@1749 622 if ( o != l2_pgentry_val(ol2e) )
djm@1749 623 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 624 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
djm@1749 625 return (o == l2_pgentry_val(ol2e));
djm@1749 626 }
djm@1749 627
djm@1749 628
djm@1749 629 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
djm@1749 630 static int mod_l2_entry(l2_pgentry_t *pl2e,
djm@1749 631 l2_pgentry_t nl2e,
djm@1749 632 unsigned long pfn)
djm@1749 633 {
djm@1749 634 l2_pgentry_t ol2e;
djm@1749 635 unsigned long _ol2e;
djm@1749 636
djm@1749 637 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
djm@1749 638 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
djm@1749 639 {
djm@1749 640 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
djm@1749 641 return 0;
djm@1749 642 }
djm@1749 643
djm@1749 644 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
djm@1749 645 return 0;
djm@1749 646 ol2e = mk_l2_pgentry(_ol2e);
djm@1749 647
djm@1749 648 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
djm@1749 649 {
djm@1749 650 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
djm@1749 651 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
djm@1749 652 return update_l2e(pl2e, ol2e, nl2e);
djm@1749 653
cl349@2957 654 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
cl349@2491 655 ((unsigned long)pl2e &
kaf24@2466 656 ~PAGE_MASK) >> 2)) )
djm@1749 657 return 0;
cl349@1860 658
djm@1749 659 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 660 {
djm@1749 661 put_page_from_l2e(nl2e, pfn);
djm@1749 662 return 0;
djm@1749 663 }
djm@1749 664
djm@1749 665 put_page_from_l2e(ol2e, pfn);
djm@1749 666 return 1;
djm@1749 667 }
djm@1749 668
djm@1749 669 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
djm@1749 670 return 0;
djm@1749 671
djm@1749 672 put_page_from_l2e(ol2e, pfn);
djm@1749 673 return 1;
djm@1749 674 }
djm@1749 675
djm@1749 676
djm@1749 677 static inline int update_l1e(l1_pgentry_t *pl1e,
djm@1749 678 l1_pgentry_t ol1e,
djm@1749 679 l1_pgentry_t nl1e)
djm@1749 680 {
djm@1749 681 unsigned long o = l1_pgentry_val(ol1e);
djm@1749 682 unsigned long n = l1_pgentry_val(nl1e);
djm@1749 683
djm@1749 684 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
djm@1749 685 unlikely(o != l1_pgentry_val(ol1e)) )
djm@1749 686 {
djm@1749 687 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
djm@1749 688 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
djm@1749 689 return 0;
djm@1749 690 }
djm@1749 691
djm@1749 692 return 1;
djm@1749 693 }
djm@1749 694
djm@1749 695
djm@1749 696 /* Update the L1 entry at pl1e to new value nl1e. */
djm@1749 697 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
djm@1749 698 {
djm@1749 699 l1_pgentry_t ol1e;
djm@1749 700 unsigned long _ol1e;
cl349@2957 701 struct domain *d = current->domain;
djm@1749 702
djm@1749 703 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
djm@1749 704 {
djm@1749 705 MEM_LOG("Bad get_user\n");
djm@1749 706 return 0;
djm@1749 707 }
djm@1749 708
djm@1749 709 ol1e = mk_l1_pgentry(_ol1e);
djm@1749 710
djm@1749 711 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
djm@1749 712 {
djm@1749 713 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
djm@1749 714 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
djm@1749 715 return update_l1e(pl1e, ol1e, nl1e);
djm@1749 716
kaf24@2314 717 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
djm@1749 718 return 0;
djm@1749 719
djm@1749 720 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 721 {
kaf24@2382 722 put_page_from_l1e(nl1e, d);
djm@1749 723 return 0;
djm@1749 724 }
djm@1749 725
kaf24@2382 726 put_page_from_l1e(ol1e, d);
djm@1749 727 return 1;
djm@1749 728 }
djm@1749 729
djm@1749 730 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
djm@1749 731 return 0;
djm@1749 732
kaf24@2382 733 put_page_from_l1e(ol1e, d);
djm@1749 734 return 1;
djm@1749 735 }
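/* Worked example of the "differ only in flag bits?" fast path above
 * (editorial): old PTE 0x00012067 and new PTE 0x00012007 map the same frame
 * (0x12) and differ only in the accessed/dirty bits, so
 * (old ^ new) & ~0xffc == 0x60 & ~0xffc == 0 and update_l1e() is attempted
 * directly, with no reference counting.  Changing the frame number, the RW
 * bit (bit 1) or the present bit (bit 0) makes the masked XOR non-zero and
 * forces the full get_page_from_l1e()/put_page_from_l1e() path. */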
djm@1749 736
djm@1749 737
djm@1749 738 int alloc_page_type(struct pfn_info *page, unsigned int type)
djm@1749 739 {
djm@1749 740 switch ( type )
djm@1749 741 {
djm@1749 742 case PGT_l1_page_table:
djm@1749 743 return alloc_l1_table(page);
djm@1749 744 case PGT_l2_page_table:
djm@1749 745 return alloc_l2_table(page);
djm@1749 746 case PGT_gdt_page:
djm@1749 747 case PGT_ldt_page:
djm@1749 748 return alloc_segdesc_page(page);
djm@1749 749 default:
cl349@2491 750 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
cl349@2491 751 type, page->u.inuse.type_info,
cl349@2491 752 page->count_info);
djm@1749 753 BUG();
djm@1749 754 }
djm@1749 755
djm@1749 756 return 0;
djm@1749 757 }
djm@1749 758
djm@1749 759
djm@1749 760 void free_page_type(struct pfn_info *page, unsigned int type)
djm@1749 761 {
kaf24@2314 762 struct domain *d = page->u.inuse.domain;
kaf24@2314 763
djm@1749 764 switch ( type )
djm@1749 765 {
djm@1749 766 case PGT_l1_page_table:
djm@1749 767 free_l1_table(page);
djm@1749 768 break;
djm@1749 769
djm@1749 770 case PGT_l2_page_table:
djm@1749 771 free_l2_table(page);
djm@1749 772 break;
djm@1749 773
djm@1749 774 default:
djm@1749 775 BUG();
djm@1749 776 }
kaf24@2314 777
cl349@2957 778 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
cl349@2957 779 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
kaf24@2314 780 {
kaf24@2314 781 unshadow_table(page_to_pfn(page), type);
cl349@2957 782 put_shadow_status(&d->exec_domain[0]->mm);
kaf24@2314 783 }
djm@1749 784 }
djm@1749 785
djm@1749 786
kaf24@2498 787 void put_page_type(struct pfn_info *page)
kaf24@2498 788 {
kaf24@2498 789 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 790
kaf24@2498 791 again:
kaf24@2498 792 do {
kaf24@2498 793 x = y;
kaf24@2498 794 nx = x - 1;
kaf24@2498 795
kaf24@2498 796 ASSERT((x & PGT_count_mask) != 0);
kaf24@2588 797
kaf24@2588 798 /*
kaf24@2588 799 * The page should always be validated while a reference is held. The
kaf24@2588 800 * exception is during domain destruction, when we forcibly invalidate
kaf24@2588 801 * page-table pages if we detect a referential loop.
kaf24@2588 802 * See domain.c:relinquish_list().
kaf24@2588 803 */
kaf24@2588 804 ASSERT((x & PGT_validated) ||
cl349@3036 805 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
kaf24@2498 806
kaf24@2498 807 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 808 {
kaf24@2498 809 /* Record TLB information for flush later. Races are harmless. */
kaf24@2790 810 page->tlbflush_timestamp = tlbflush_current_time();
kaf24@2498 811
kaf24@2588 812 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
kaf24@2588 813 likely(nx & PGT_validated) )
kaf24@2498 814 {
kaf24@2498 815 /*
kaf24@2498 816 * Page-table pages must be unvalidated when count is zero. The
kaf24@2498 817 * 'free' is safe because the refcnt is non-zero and validated
kaf24@2498 818 * bit is clear => other ops will spin or fail.
kaf24@2498 819 */
kaf24@2498 820 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
kaf24@2498 821 x & ~PGT_validated)) != x) )
kaf24@2498 822 goto again;
kaf24@2498 823 /* We cleared the 'valid bit' so we do the clean-up. */
kaf24@2498 824 free_page_type(page, x & PGT_type_mask);
kaf24@2498 825 /* Carry on, but with the 'valid bit' now clear. */
kaf24@2498 826 x &= ~PGT_validated;
kaf24@2498 827 nx &= ~PGT_validated;
kaf24@2498 828 }
kaf24@2498 829 }
cl349@2644 830 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
kaf24@2498 831 (PGT_pinned | 1)) )
cl349@2644 832 {
kaf24@2498 833 /* Page is now only pinned. Make the back pointer mutable again. */
cl349@2644 834 nx |= PGT_va_mutable;
cl349@2644 835 }
kaf24@2498 836 }
kaf24@2498 837 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 838 }
kaf24@2498 839
kaf24@2498 840
kaf24@2498 841 int get_page_type(struct pfn_info *page, u32 type)
kaf24@2498 842 {
kaf24@2498 843 u32 nx, x, y = page->u.inuse.type_info;
kaf24@2498 844
kaf24@2498 845 again:
kaf24@2498 846 do {
kaf24@2498 847 x = y;
kaf24@2498 848 nx = x + 1;
kaf24@2498 849 if ( unlikely((nx & PGT_count_mask) == 0) )
kaf24@2498 850 {
kaf24@2498 851 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
kaf24@2498 852 return 0;
kaf24@2498 853 }
kaf24@2498 854 else if ( unlikely((x & PGT_count_mask) == 0) )
kaf24@2498 855 {
kaf24@2498 856 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
kaf24@2498 857 {
kaf24@2498 858 /*
kaf24@2498 859 * On type change we check to flush stale TLB entries. This
kaf24@2498 860 * may be unnecessary (e.g., page was GDT/LDT) but those
kaf24@2498 861 * circumstances should be very rare.
kaf24@2498 862 */
kaf24@2498 863 struct domain *d = page->u.inuse.domain;
cl349@2957 864 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
kaf24@2498 865 page->tlbflush_timestamp)) )
kaf24@2498 866 {
kaf24@2498 867 perfc_incr(need_flush_tlb_flush);
cl349@2957 868 flush_tlb_cpu(d->exec_domain[0]->processor);
kaf24@2498 869 }
kaf24@2498 870
kaf24@2498 871 /* We lose existing type, back pointer, and validity. */
kaf24@2498 872 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
kaf24@2498 873 nx |= type;
kaf24@2498 874
kaf24@2498 875 /* No special validation needed for writable pages. */
kaf24@2498 876 /* Page tables and GDT/LDT need to be scanned for validity. */
kaf24@2498 877 if ( type == PGT_writable_page )
kaf24@2498 878 nx |= PGT_validated;
kaf24@2498 879 }
kaf24@2498 880 }
kaf24@2498 881 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
kaf24@2498 882 {
kaf24@2498 883 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
kaf24@2498 884 {
kaf24@2498 885 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
kaf24@2498 886 ((type & PGT_type_mask) != PGT_l1_page_table) )
kaf24@2498 887 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
kaf24@2498 888 x & PGT_type_mask, type, page_to_pfn(page));
kaf24@2498 889 return 0;
kaf24@2498 890 }
kaf24@2498 891 else if ( (x & PGT_va_mask) == PGT_va_mutable )
kaf24@2498 892 {
kaf24@2498 893 /* The va backpointer is mutable, hence we update it. */
kaf24@2498 894 nx &= ~PGT_va_mask;
kaf24@2498 895 nx |= type; /* we know the actual type is correct */
kaf24@2498 896 }
kaf24@2498 897 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
kaf24@2498 898 {
kaf24@2506 899 /* This table is potentially mapped at multiple locations. */
kaf24@2506 900 nx &= ~PGT_va_mask;
kaf24@2506 901 nx |= PGT_va_unknown;
kaf24@2498 902 }
kaf24@2498 903 }
cl349@2644 904 else if ( unlikely(!(x & PGT_validated)) )
kaf24@2498 905 {
kaf24@2498 906 /* Someone else is updating validation of this page. Wait... */
kaf24@2498 907 while ( (y = page->u.inuse.type_info) == x )
kaf24@2498 908 {
kaf24@2498 909 rep_nop();
kaf24@2498 910 barrier();
kaf24@2498 911 }
kaf24@2498 912 goto again;
kaf24@2498 913 }
kaf24@2498 914 }
kaf24@2498 915 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
kaf24@2498 916
kaf24@2498 917 if ( unlikely(!(nx & PGT_validated)) )
kaf24@2498 918 {
kaf24@2498 919 /* Try to validate page type; drop the new reference on failure. */
kaf24@2498 920 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
kaf24@2498 921 {
kaf24@2498 922 MEM_LOG("Error while validating pfn %08lx for type %08x."
kaf24@2498 923 " caf=%08x taf=%08x\n",
kaf24@2498 924 page_to_pfn(page), type,
cl349@2644 925 page->count_info,
cl349@2644 926 page->u.inuse.type_info);
kaf24@2498 927 /* No one else can get a reference. We hold the only ref. */
kaf24@2498 928 page->u.inuse.type_info = 0;
kaf24@2498 929 return 0;
kaf24@2498 930 }
kaf24@2498 931
kaf24@2498 932 /* No one else is updating simultaneously. */
kaf24@2498 933 __set_bit(_PGT_validated, &page->u.inuse.type_info);
kaf24@2498 934 }
kaf24@2498 935
kaf24@2498 936 return 1;
kaf24@2498 937 }
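A minimal sketch of the calling convention (editorial; the real callers are do_mmu_update() and the table allocators in this file): take a type reference, perform the access that the type makes safe, then drop the reference. get_page_type() fails, rather than blocking, when the frame is already in use under an incompatible type.

    if ( get_page_type(page, PGT_writable_page) )
    {
        /* The frame cannot concurrently be a live page table, GDT or LDT. */
        void *p = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT);
        memset(p, 0, PAGE_SIZE);
        unmap_domain_mem(p);
        put_page_type(page);
    }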
kaf24@2498 938
kaf24@2498 939
kaf24@3443 940 int new_guest_cr3(unsigned long pfn)
kaf24@3443 941 {
kaf24@3443 942 struct exec_domain *ed = current;
kaf24@3443 943 struct domain *d = ed->domain;
kaf24@3443 944 int okay, cpu = smp_processor_id();
kaf24@3443 945 unsigned long old_base_pfn;
kaf24@3443 946
kaf24@3443 947 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
kaf24@3443 948 if ( likely(okay) )
kaf24@3443 949 {
kaf24@3443 950 invalidate_shadow_ldt(ed);
kaf24@3443 951
kaf24@3443 952 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
kaf24@3443 953 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
kaf24@3443 954 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
kaf24@3443 955
kaf24@3443 956 shadow_mk_pagetable(&ed->mm);
kaf24@3443 957
kaf24@3443 958 write_ptbase(&ed->mm);
kaf24@3443 959
kaf24@3443 960 put_page_and_type(&frame_table[old_base_pfn]);
kaf24@3443 961 }
kaf24@3443 962 else
kaf24@3443 963 {
kaf24@3517 964 MEM_LOG("Error while installing new baseptr %08lx", pfn);
kaf24@3443 965 }
kaf24@3443 966
kaf24@3443 967 return okay;
kaf24@3443 968 }
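From the guest side, a page-table switch is usually expressed through the extended-command interface handled below: pin the new L2 table, install it as the base pointer, then unpin the old one. A hedged sketch using the encoding do_extended_command() expects (subject frame in 'ptr', command in the low bits of 'val'); HYPERVISOR_mmu_update() is the same hypothetical guest wrapper as in earlier sketches.

    mmu_update_t req[3];

    req[0].ptr = (new_l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[0].val = MMUEXT_PIN_L2_TABLE;    /* validate and pin the new root    */

    req[1].ptr = (new_l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_NEW_BASEPTR;     /* handled by new_guest_cr3() above */

    req[2].ptr = (old_l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[2].val = MMUEXT_UNPIN_TABLE;     /* let the old root be recycled     */

    (void)HYPERVISOR_mmu_update(req, 3, NULL);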
kaf24@3443 969
djm@1749 970 static int do_extended_command(unsigned long ptr, unsigned long val)
djm@1749 971 {
djm@1749 972 int okay = 1, cpu = smp_processor_id();
djm@1749 973 unsigned int cmd = val & MMUEXT_CMD_MASK;
djm@1749 974 unsigned long pfn = ptr >> PAGE_SHIFT;
djm@1749 975 struct pfn_info *page = &frame_table[pfn];
cl349@2957 976 struct exec_domain *ed = current;
cl349@2957 977 struct domain *d = ed->domain, *nd, *e;
djm@1749 978 u32 x, y;
djm@1749 979 domid_t domid;
kaf24@2385 980 grant_ref_t gntref;
djm@1749 981
djm@1749 982 switch ( cmd )
djm@1749 983 {
kaf24@2465 984 case MMUEXT_PIN_L1_TABLE:
kaf24@2465 985 case MMUEXT_PIN_L2_TABLE:
kaf24@2466 986 /*
kaf24@2466 987 * We insist that, if you pin an L1 page, it's the first thing that
kaf24@2466 988 * you do to it. This is because we require the backptr to still be
kaf24@2466 989 * mutable. This assumption seems safe.
kaf24@2466 990 */
djm@1749 991 okay = get_page_and_type_from_pagenr(
kaf24@2465 992 pfn,
kaf24@2465 993 ((cmd==MMUEXT_PIN_L2_TABLE) ?
cl349@2491 994 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
kaf24@2465 995 FOREIGNDOM);
iap10@2458 996
djm@1749 997 if ( unlikely(!okay) )
djm@1749 998 {
djm@1749 999 MEM_LOG("Error while pinning pfn %08lx", pfn);
djm@1749 1000 break;
djm@1749 1001 }
djm@1749 1002
kaf24@2466 1003 if ( unlikely(test_and_set_bit(_PGT_pinned,
kaf24@2466 1004 &page->u.inuse.type_info)) )
djm@1749 1005 {
djm@1749 1006 MEM_LOG("Pfn %08lx already pinned", pfn);
djm@1749 1007 put_page_and_type(page);
djm@1749 1008 okay = 0;
djm@1749 1009 break;
djm@1749 1010 }
djm@1749 1011
djm@1749 1012 break;
djm@1749 1013
djm@1749 1014 case MMUEXT_UNPIN_TABLE:
kaf24@2314 1015 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
djm@1749 1016 {
djm@1749 1017 MEM_LOG("Page %08lx bad domain (dom=%p)",
kaf24@1970 1018 ptr, page->u.inuse.domain);
djm@1749 1019 }
kaf24@2466 1020 else if ( likely(test_and_clear_bit(_PGT_pinned,
kaf24@2466 1021 &page->u.inuse.type_info)) )
djm@1749 1022 {
djm@1749 1023 put_page_and_type(page);
djm@1749 1024 put_page(page);
djm@1749 1025 }
djm@1749 1026 else
djm@1749 1027 {
djm@1749 1028 okay = 0;
djm@1749 1029 put_page(page);
djm@1749 1030 MEM_LOG("Pfn %08lx not pinned", pfn);
djm@1749 1031 }
djm@1749 1032 break;
djm@1749 1033
djm@1749 1034 case MMUEXT_NEW_BASEPTR:
kaf24@3443 1035 okay = new_guest_cr3(pfn);
djm@1749 1036 break;
djm@1749 1037
djm@1749 1038 case MMUEXT_TLB_FLUSH:
djm@1749 1039 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
djm@1749 1040 break;
djm@1749 1041
djm@1749 1042 case MMUEXT_INVLPG:
djm@1749 1043 __flush_tlb_one(ptr);
djm@1749 1044 break;
djm@1749 1045
kaf24@2463 1046 case MMUEXT_FLUSH_CACHE:
kaf24@2463 1047 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
kaf24@2463 1048 {
kaf24@2463 1049 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
kaf24@2463 1050 okay = 0;
kaf24@2463 1051 }
kaf24@2463 1052 else
kaf24@2463 1053 {
kaf24@2463 1054 wbinvd();
kaf24@2463 1055 }
kaf24@2463 1056 break;
kaf24@2463 1057
djm@1749 1058 case MMUEXT_SET_LDT:
djm@1749 1059 {
djm@1749 1060 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
djm@1749 1061 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
djm@1749 1062 (ents > 8192) ||
djm@1749 1063 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
djm@1749 1064 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
djm@1749 1065 {
djm@1749 1066 okay = 0;
djm@1749 1067 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
djm@1749 1068 }
cl349@2957 1069 else if ( (ed->mm.ldt_ents != ents) ||
cl349@2957 1070 (ed->mm.ldt_base != ptr) )
djm@1749 1071 {
cl349@2957 1072 invalidate_shadow_ldt(ed);
cl349@2957 1073 ed->mm.ldt_base = ptr;
cl349@2957 1074 ed->mm.ldt_ents = ents;
cl349@2957 1075 load_LDT(ed);
djm@1749 1076 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
djm@1749 1077 if ( ents != 0 )
djm@1749 1078 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
djm@1749 1079 }
djm@1749 1080 break;
djm@1749 1081 }
djm@1749 1082
kaf24@2314 1083 case MMUEXT_SET_FOREIGNDOM:
kaf24@2314 1084 domid = (domid_t)(val >> 16);
djm@1749 1085
kaf24@2362 1086 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2362 1087 put_domain(e);
kaf24@2362 1088 percpu_info[cpu].foreign = NULL;
kaf24@2362 1089
djm@1749 1090 if ( !IS_PRIV(d) )
djm@1749 1091 {
kaf24@2336 1092 switch ( domid )
kaf24@2336 1093 {
kaf24@2336 1094 case DOMID_IO:
kaf24@2362 1095 get_knownalive_domain(dom_io);
kaf24@2362 1096 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1097 break;
kaf24@2336 1098 default:
kaf24@2748 1099 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
kaf24@2336 1100 okay = 0;
kaf24@2336 1101 break;
kaf24@2336 1102 }
djm@1749 1103 }
djm@1749 1104 else
djm@1749 1105 {
kaf24@2314 1106 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
kaf24@2314 1107 if ( e == NULL )
djm@1749 1108 {
kaf24@2336 1109 switch ( domid )
kaf24@2336 1110 {
kaf24@2336 1111 case DOMID_XEN:
kaf24@2362 1112 get_knownalive_domain(dom_xen);
kaf24@2362 1113 percpu_info[cpu].foreign = dom_xen;
kaf24@2336 1114 break;
kaf24@2336 1115 case DOMID_IO:
kaf24@2362 1116 get_knownalive_domain(dom_io);
kaf24@2362 1117 percpu_info[cpu].foreign = dom_io;
kaf24@2336 1118 break;
kaf24@2336 1119 default:
kaf24@2336 1120 MEM_LOG("Unknown domain '%u'", domid);
kaf24@2336 1121 okay = 0;
kaf24@2336 1122 break;
kaf24@2336 1123 }
djm@1749 1124 }
djm@1749 1125 }
djm@1749 1126 break;
djm@1749 1127
kaf24@2385 1128 case MMUEXT_TRANSFER_PAGE:
kaf24@2385 1129 domid = (domid_t)(val >> 16);
kaf24@2385 1130 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
kaf24@2385 1131
kaf24@2385 1132 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
kaf24@2385 1133 unlikely(!pfn_is_ram(pfn)) ||
kaf24@2385 1134 unlikely((e = find_domain_by_id(domid)) == NULL) )
kaf24@2385 1135 {
kaf24@2385 1136 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
kaf24@2385 1137 okay = 0;
kaf24@2385 1138 break;
kaf24@2385 1139 }
kaf24@2385 1140
kaf24@2385 1141 spin_lock(&d->page_alloc_lock);
kaf24@2385 1142
kaf24@2385 1143 /*
kaf24@2385 1144 * The tricky bit: atomically release ownership while there is just one
kaf24@2385 1145 * benign reference to the page (PGC_allocated). If that reference
kaf24@2385 1146 * disappears then the deallocation routine will safely spin.
kaf24@2385 1147 */
kaf24@2385 1148 nd = page->u.inuse.domain;
kaf24@2385 1149 y = page->count_info;
kaf24@2385 1150 do {
kaf24@2385 1151 x = y;
kaf24@2385 1152 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
kaf24@2385 1153 (1|PGC_allocated)) ||
kaf24@2385 1154 unlikely(nd != d) )
kaf24@2385 1155 {
kaf24@2385 1156 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
kaf24@2385 1157 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1158 d, d->id, nd, x, page->u.inuse.type_info);
kaf24@2385 1159 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1160 put_domain(e);
kaf24@2663 1161 return 0;
kaf24@2385 1162 }
kaf24@2385 1163 __asm__ __volatile__(
kaf24@2385 1164 LOCK_PREFIX "cmpxchg8b %2"
kaf24@2385 1165 : "=d" (nd), "=a" (y),
kaf24@2385 1166 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2385 1167 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
kaf24@2385 1168 }
kaf24@2385 1169 while ( unlikely(nd != d) || unlikely(y != x) );
kaf24@2385 1170
kaf24@2385 1171 /*
kaf24@2385 1172 * Unlink from 'd'. At least one reference remains (now anonymous), so
kaf24@2385 1173 * no one else is spinning to try to delete this page from 'd'.
kaf24@2385 1174 */
kaf24@2385 1175 d->tot_pages--;
kaf24@2385 1176 list_del(&page->list);
kaf24@2385 1177
kaf24@2385 1178 spin_unlock(&d->page_alloc_lock);
kaf24@2385 1179
kaf24@2385 1180 spin_lock(&e->page_alloc_lock);
kaf24@2385 1181
kaf24@2466 1182 /*
kaf24@2466 1183 * Check that 'e' will accept the page and has reservation headroom.
kaf24@2466 1184 * Also, a domain mustn't have PGC_allocated pages when it is dying.
kaf24@2466 1185 */
kaf24@2385 1186 ASSERT(e->tot_pages <= e->max_pages);
cl349@2957 1187 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
kaf24@2466 1188 unlikely(e->tot_pages == e->max_pages) ||
kaf24@2385 1189 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
kaf24@2385 1190 {
kaf24@2431 1191 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
kaf24@2469 1192 "provided a bad grant ref, or is dying (%08lx).\n",
cl349@2957 1193 e->tot_pages, e->max_pages, e->d_flags);
kaf24@2385 1194 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1195 put_domain(e);
kaf24@2385 1196 okay = 0;
kaf24@2385 1197 break;
kaf24@2385 1198 }
kaf24@2385 1199
kaf24@2385 1200 /* Okay, add the page to 'e'. */
kaf24@2385 1201 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2385 1202 get_knownalive_domain(e);
kaf24@2385 1203 list_add_tail(&page->list, &e->page_list);
kaf24@2385 1204 page->u.inuse.domain = e;
kaf24@2385 1205
kaf24@2385 1206 spin_unlock(&e->page_alloc_lock);
kaf24@2385 1207
kaf24@2385 1208 /* Transfer is all done: tell the guest about its new page frame. */
kaf24@2385 1209 gnttab_notify_transfer(e, gntref, pfn);
kaf24@2385 1210
kaf24@2385 1211 put_domain(e);
kaf24@2385 1212 break;
kaf24@2385 1213
djm@1749 1214 case MMUEXT_REASSIGN_PAGE:
djm@1749 1215 if ( unlikely(!IS_PRIV(d)) )
djm@1749 1216 {
kaf24@2748 1217 MEM_LOG("Dom %u has no reassignment priv", d->id);
djm@1749 1218 okay = 0;
djm@1749 1219 break;
djm@1749 1220 }
djm@1749 1221
kaf24@2314 1222 e = percpu_info[cpu].foreign;
kaf24@2314 1223 if ( unlikely(e == NULL) )
djm@1749 1224 {
kaf24@2314 1225 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
djm@1749 1226 okay = 0;
djm@1749 1227 break;
djm@1749 1228 }
djm@1749 1229
djm@1749 1230 /*
djm@1749 1231 * Grab both page_list locks, in order. This prevents the page from
djm@1749 1232 * disappearing elsewhere while we modify the owner, and we'll need
djm@1749 1233 * both locks if we're successful so that we can change lists.
djm@1749 1234 */
djm@1749 1235 if ( d < e )
djm@1749 1236 {
djm@1749 1237 spin_lock(&d->page_alloc_lock);
djm@1749 1238 spin_lock(&e->page_alloc_lock);
djm@1749 1239 }
djm@1749 1240 else
djm@1749 1241 {
djm@1749 1242 spin_lock(&e->page_alloc_lock);
djm@1749 1243 spin_lock(&d->page_alloc_lock);
djm@1749 1244 }
djm@1749 1245
djm@1749 1246 /* A domain shouldn't have PGC_allocated pages when it is dying. */
cl349@2957 1247 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
djm@1749 1248 unlikely(IS_XEN_HEAP_FRAME(page)) )
djm@1749 1249 {
kaf24@1871 1250 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
djm@1749 1251 okay = 0;
djm@1749 1252 goto reassign_fail;
djm@1749 1253 }
djm@1749 1254
djm@1749 1255 /*
djm@1749 1256 * The tricky bit: atomically change owner while there is just one
djm@1749 1257 * benign reference to the page (PGC_allocated). If that reference
djm@1749 1258 * disappears then the deallocation routine will safely spin.
djm@1749 1259 */
kaf24@1970 1260 nd = page->u.inuse.domain;
kaf24@2384 1261 y = page->count_info;
djm@1749 1262 do {
djm@1749 1263 x = y;
djm@1749 1264 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
djm@1749 1265 (1|PGC_allocated)) ||
djm@1749 1266 unlikely(nd != d) )
djm@1749 1267 {
djm@1749 1268 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
djm@1749 1269 " caf=%08x, taf=%08x\n", page_to_pfn(page),
kaf24@2748 1270 d, d->id, nd, x, page->u.inuse.type_info);
djm@1749 1271 okay = 0;
djm@1749 1272 goto reassign_fail;
djm@1749 1273 }
djm@1749 1274 __asm__ __volatile__(
djm@1749 1275 LOCK_PREFIX "cmpxchg8b %3"
kaf24@2384 1276 : "=d" (nd), "=a" (y), "=c" (e),
kaf24@2384 1277 "=m" (*(volatile u64 *)(&page->count_info))
kaf24@2384 1278 : "0" (d), "1" (x), "c" (e), "b" (x) );
djm@1749 1279 }
djm@1749 1280 while ( unlikely(nd != d) || unlikely(y != x) );
djm@1749 1281
djm@1749 1282 /*
djm@1749 1283 * Unlink from 'd'. We transferred at least one reference to 'e', so
djm@1749 1284 * no one else is spinning to try to delete this page from 'd'.
djm@1749 1285 */
djm@1749 1286 d->tot_pages--;
djm@1749 1287 list_del(&page->list);
djm@1749 1288
djm@1749 1289 /*
djm@1749 1290 * Add the page to 'e'. Someone may already have removed the last
djm@1749 1291 * reference and want to remove the page from 'e'. However, we have
djm@1749 1292 * the lock so they'll spin waiting for us.
djm@1749 1293 */
djm@1749 1294 if ( unlikely(e->tot_pages++ == 0) )
kaf24@2336 1295 get_knownalive_domain(e);
djm@1749 1296 list_add_tail(&page->list, &e->page_list);
djm@1749 1297
djm@1749 1298 reassign_fail:
djm@1749 1299 spin_unlock(&d->page_alloc_lock);
djm@1749 1300 spin_unlock(&e->page_alloc_lock);
djm@1749 1301 break;
djm@1749 1302
kaf24@2314 1303 case MMUEXT_CLEAR_FOREIGNDOM:
kaf24@2314 1304 if ( (e = percpu_info[cpu].foreign) != NULL )
kaf24@2314 1305 put_domain(e);
kaf24@2314 1306 percpu_info[cpu].foreign = NULL;
djm@1749 1307 break;
djm@1749 1308
djm@1749 1309 default:
djm@1749 1310 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
djm@1749 1311 okay = 0;
djm@1749 1312 break;
djm@1749 1313 }
djm@1749 1314
djm@1749 1315 return okay;
djm@1749 1316 }
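One more guest-side encoding example, this time for MMUEXT_SET_LDT as handled above (editorial sketch; ldt_va is a hypothetical page-aligned linear address at which the guest keeps 32 descriptors):

    mmu_update_t req;

    req.ptr = ldt_va | MMU_EXTENDED_COMMAND;                /* LDT base    */
    req.val = MMUEXT_SET_LDT | (32UL << MMUEXT_CMD_SHIFT);  /* entry count */
    (void)HYPERVISOR_mmu_update(&req, 1, NULL);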
djm@1749 1317
kaf24@3177 1318 int do_mmu_update(
kaf24@3177 1319 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
kaf24@3177 1320 {
kaf24@3177 1321 /*
kaf24@3177 1322 * We steal the m.s.b. of the @count parameter to indicate whether this
kaf24@3177 1323 * invocation of do_mmu_update() is resuming a previously preempted call.
kaf24@3187 1324 * We steal the next 15 bits to remember the current FOREIGNDOM.
kaf24@3177 1325 */
kaf24@3187 1326 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
kaf24@3187 1327 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
kaf24@3187 1328 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
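/* Worked example (editorial): on a 32-bit build the FDOM shift is 16, so a
 * continuation with 100 requests left and FOREIGNDOM id 3 re-enters with
 * count = 100 | (3 << 16) | MMU_UPDATE_PREEMPTED = 0x80030064; the resume
 * path below strips the flag and the domid before carrying on. */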
djm@1749 1329
djm@1749 1330 mmu_update_t req;
djm@1749 1331 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
djm@1749 1332 struct pfn_info *page;
kaf24@3187 1333 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
kaf24@3177 1334 unsigned int cmd, done = 0;
djm@1749 1335 unsigned long prev_spfn = 0;
djm@1749 1336 l1_pgentry_t *prev_spl1e = 0;
cl349@2957 1337 struct exec_domain *ed = current;
cl349@2957 1338 struct domain *d = ed->domain;
kaf24@2466 1339 u32 type_info;
kaf24@3187 1340 domid_t domid;
djm@1749 1341
cl349@3036 1342 LOCK_BIGLOCK(d);
cl349@3036 1343
kaf24@3517 1344 cleanup_writable_pagetable(d);
kaf24@2375 1345
kaf24@3177 1346 /*
kaf24@3177 1347 * If we are resuming after preemption, read how much work we have already
kaf24@3177 1348 * done. This allows us to set the @done output parameter correctly.
kaf24@3187 1349 * We also reset FOREIGNDOM here.
kaf24@3177 1350 */
kaf24@3187 1351 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
kaf24@3177 1352 {
kaf24@3187 1353 if ( !(count & MMU_UPDATE_PREEMPTED) )
kaf24@3187 1354 {
kaf24@3187 1355 /* Count overflow into private FOREIGNDOM field. */
kaf24@3187 1356 MEM_LOG("do_mmu_update count is too large");
kaf24@3187 1357 rc = -EINVAL;
kaf24@3187 1358 goto out;
kaf24@3187 1359 }
kaf24@3177 1360 count &= ~MMU_UPDATE_PREEMPTED;
kaf24@3187 1361 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
kaf24@3187 1362 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
kaf24@3177 1363 if ( unlikely(pdone != NULL) )
kaf24@3177 1364 (void)get_user(done, pdone);
cl349@3193 1365 if ( (domid != current->domain->id) &&
kaf24@3187 1366 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
kaf24@3187 1367 {
kaf24@3187 1368 rc = -EINVAL;
kaf24@3187 1369 goto out;
kaf24@3187 1370 }
kaf24@3177 1371 }
kaf24@3177 1372
kaf24@3269 1373 perfc_incrc(calls_to_mmu_update);
kaf24@3269 1374 perfc_addc(num_page_updates, count);
kaf24@3269 1375
kaf24@3177 1376 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
kaf24@3187 1377 {
kaf24@3187 1378 rc = -EFAULT;
kaf24@3187 1379 goto out;
kaf24@3187 1380 }
cl349@1860 1381
djm@1749 1382 for ( i = 0; i < count; i++ )
djm@1749 1383 {
kaf24@3177 1384 if ( hypercall_preempt_check() )
kaf24@3177 1385 {
kaf24@3187 1386 rc = hypercall_create_continuation(
kaf24@3177 1387 __HYPERVISOR_mmu_update, 3, ureqs,
kaf24@3187 1388 (count - i) |
kaf24@3187 1389 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
kaf24@3187 1390 MMU_UPDATE_PREEMPTED, pdone);
kaf24@3177 1391 break;
kaf24@3177 1392 }
kaf24@3129 1393
kaf24@2375 1394 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
djm@1749 1395 {
kaf24@2375 1396 MEM_LOG("Bad __copy_from_user");
djm@1749 1397 rc = -EFAULT;
djm@1749 1398 break;
djm@1749 1399 }
djm@1749 1400
djm@1749 1401 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
djm@1749 1402 pfn = req.ptr >> PAGE_SHIFT;
djm@1749 1403
djm@1749 1404 okay = 0;
djm@1749 1405
djm@1749 1406 switch ( cmd )
djm@1749 1407 {
djm@1749 1408 /*
djm@1749 1409 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
djm@1749 1410 */
djm@1749 1411 case MMU_NORMAL_PT_UPDATE:
cl349@2957 1412 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
djm@1749 1413 {
djm@1749 1414 MEM_LOG("Could not get page for normal update");
djm@1749 1415 break;
djm@1749 1416 }
djm@1749 1417
djm@1749 1418 if ( likely(prev_pfn == pfn) )
djm@1749 1419 {
djm@1749 1420 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
djm@1749 1421 }
djm@1749 1422 else
djm@1749 1423 {
djm@1749 1424 if ( prev_pfn != 0 )
djm@1749 1425 unmap_domain_mem((void *)va);
djm@1749 1426 va = (unsigned long)map_domain_mem(req.ptr);
djm@1749 1427 prev_pfn = pfn;
djm@1749 1428 }
djm@1749 1429
djm@1749 1430 page = &frame_table[pfn];
kaf24@2466 1431 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
djm@1749 1432 {
djm@1749 1433 case PGT_l1_page_table:
kaf24@2466 1434 if ( likely(get_page_type(
kaf24@2466 1435 page, type_info & (PGT_type_mask|PGT_va_mask))) )
djm@1749 1436 {
djm@1749 1437 okay = mod_l1_entry((l1_pgentry_t *)va,
djm@1749 1438 mk_l1_pgentry(req.val));
djm@1749 1439
cl349@2957 1440 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1441 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1442 PSH_shadowed) )
djm@1749 1443 {
kaf24@2375 1444 shadow_l1_normal_pt_update(
kaf24@2375 1445 req.ptr, req.val, &prev_spfn, &prev_spl1e);
cl349@2957 1446 put_shadow_status(&ed->mm);
djm@1749 1447 }
djm@1749 1448
djm@1749 1449 put_page_type(page);
djm@1749 1450 }
djm@1749 1451 break;
djm@1749 1452 case PGT_l2_page_table:
djm@1749 1453 if ( likely(get_page_type(page, PGT_l2_page_table)) )
djm@1749 1454 {
djm@1749 1455 okay = mod_l2_entry((l2_pgentry_t *)va,
djm@1749 1456 mk_l2_pgentry(req.val),
djm@1749 1457 pfn);
djm@1749 1458
cl349@2957 1459 if ( unlikely(ed->mm.shadow_mode) && okay &&
cl349@2957 1460 (get_shadow_status(&ed->mm, page-frame_table) &
djm@1749 1461 PSH_shadowed) )
djm@1749 1462 {
kaf24@2375 1463 shadow_l2_normal_pt_update(req.ptr, req.val);
cl349@2957 1464 put_shadow_status(&ed->mm);
djm@1749 1465 }
djm@1749 1466
djm@1749 1467 put_page_type(page);
djm@1749 1468 }
djm@1749 1469 break;
djm@1749 1470 default:
kaf24@2375 1471 if ( likely(get_page_type(page, PGT_writable_page)) )
djm@1749 1472 {
djm@1749 1473 *(unsigned long *)va = req.val;
djm@1749 1474 okay = 1;
djm@1749 1475 put_page_type(page);
djm@1749 1476 }
djm@1749 1477 break;
djm@1749 1478 }
djm@1749 1479
djm@1749 1480 put_page(page);
djm@1749 1481 break;
djm@1749 1482
djm@1749 1483 case MMU_MACHPHYS_UPDATE:
kaf24@2314 1484 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
djm@1749 1485 {
djm@1749 1486 MEM_LOG("Could not get page for mach->phys update");
djm@1749 1487 break;
djm@1749 1488 }
djm@1749 1489
djm@1749 1490 machine_to_phys_mapping[pfn] = req.val;
djm@1749 1491 okay = 1;
djm@1749 1492
djm@1749 1493 /*
djm@1749 1494 * If in log-dirty mode, mark the corresponding pseudo-physical
djm@1749 1495 * page as dirty.
djm@1749 1496 */
cl349@2957 1497 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
cl349@2957 1498 mark_dirty(&ed->mm, pfn) )
cl349@2957 1499 ed->mm.shadow_dirty_block_count++;
djm@1749 1500
djm@1749 1501 put_page(&frame_table[pfn]);
djm@1749 1502 break;
djm@1749 1503
djm@1749 1504 /*
djm@1749 1505 * MMU_EXTENDED_COMMAND: Extended command is specified
djm@1749 1506 * in the least-significant bits of the 'value' field.
djm@1749 1507 */
djm@1749 1508 case MMU_EXTENDED_COMMAND:
djm@1749 1509 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
djm@1749 1510 okay = do_extended_command(req.ptr, req.val);
djm@1749 1511 break;
djm@1749 1512
djm@1749 1513 default:
djm@1749 1514 MEM_LOG("Invalid page update command %08lx", req.ptr);
djm@1749 1515 break;
djm@1749 1516 }
djm@1749 1517
djm@1749 1518 if ( unlikely(!okay) )
djm@1749 1519 {
djm@1749 1520 rc = -EINVAL;
djm@1749 1521 break;
djm@1749 1522 }
djm@1749 1523
djm@1749 1524 ureqs++;
djm@1749 1525 }
djm@1749 1526
kaf24@3187 1527 out:
djm@1749 1528 if ( prev_pfn != 0 )
djm@1749 1529 unmap_domain_mem((void *)va);
djm@1749 1530
kaf24@2375 1531 if ( unlikely(prev_spl1e != 0) )
djm@1749 1532 unmap_domain_mem((void *)prev_spl1e);
djm@1749 1533
djm@1749 1534 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1535 percpu_info[cpu].deferred_ops = 0;
djm@1749 1536
djm@1749 1537 if ( deferred_ops & DOP_FLUSH_TLB )
djm@1749 1538 local_flush_tlb();
kaf24@2375 1539
djm@1749 1540 if ( deferred_ops & DOP_RELOAD_LDT )
djm@1749 1541 (void)map_ldt_shadow_page(0);
djm@1749 1542
kaf24@2314 1543 if ( unlikely(percpu_info[cpu].foreign != NULL) )
djm@1749 1544 {
kaf24@2314 1545 put_domain(percpu_info[cpu].foreign);
kaf24@2314 1546 percpu_info[cpu].foreign = NULL;
djm@1749 1547 }
djm@1749 1548
kaf24@3177 1549 /* Add incremental work we have done to the @done output parameter. */
kaf24@3177 1550 if ( unlikely(pdone != NULL) )
kaf24@3177 1551 __put_user(done + i, pdone);
djm@1749 1552
cl349@3036 1553 UNLOCK_BIGLOCK(d);
djm@1749 1554 return rc;
djm@1749 1555 }
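
/*
 * Editor's sketch (not part of the original source): how the preemption
 * path in do_mmu_update() packs its resume state into the 'count'
 * argument of the continued hypercall, and how the top of the function
 * unpacks it again.  Only the MMU_UPDATE_PREEMPT* constants and the
 * FOREIGNDOM id come from the code above; the helper names are
 * hypothetical and mirror the existing statements rather than replace them.
 */
static inline unsigned long pack_preempted_count(
    unsigned long remaining, domid_t fdom)
{
    /* Remaining requests in the low bits, FOREIGNDOM id above them,
     * plus the "this is a continuation" flag. */
    return remaining |
        ((unsigned long)fdom << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
        MMU_UPDATE_PREEMPTED;
}

static inline void unpack_preempted_count(
    unsigned long packed, unsigned long *remaining, domid_t *fdom)
{
    packed    &= ~MMU_UPDATE_PREEMPTED;                   /* drop the flag */
    *fdom      = packed >> MMU_UPDATE_PREEMPT_FDOM_SHIFT; /* recover domid */
    *remaining = packed & ~MMU_UPDATE_PREEMPT_FDOM_MASK;  /* recover count */
}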
djm@1749 1556
djm@1749 1557
djm@1749 1558 int do_update_va_mapping(unsigned long page_nr,
djm@1749 1559 unsigned long val,
djm@1749 1560 unsigned long flags)
djm@1749 1561 {
cl349@2957 1562 struct exec_domain *ed = current;
cl349@2957 1563 struct domain *d = ed->domain;
djm@1749 1564 int err = 0;
cl349@2957 1565 unsigned int cpu = ed->processor;
djm@1749 1566 unsigned long deferred_ops;
djm@1749 1567
djm@1749 1568 perfc_incrc(calls_to_update_va);
djm@1749 1569
djm@1749 1570 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
djm@1749 1571 return -EINVAL;
djm@1749 1572
cl349@3036 1573 LOCK_BIGLOCK(d);
cl349@3036 1574
kaf24@3517 1575 cleanup_writable_pagetable(d);
cl349@1879 1576
djm@1749 1577 /*
djm@1749 1578 * XXX When we make this support 4MB superpages we should also deal with
djm@1749 1579 * the case of updating L2 entries.
djm@1749 1580 */
djm@1749 1581
djm@1749 1582 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
djm@1749 1583 mk_l1_pgentry(val))) )
djm@1749 1584 err = -EINVAL;
djm@1749 1585
cl349@2957 1586 if ( unlikely(ed->mm.shadow_mode) )
djm@1749 1587 {
djm@1749 1588 unsigned long sval;
djm@1749 1589
cl349@2957 1590 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
djm@1749 1591
djm@1749 1592 if ( unlikely(__put_user(sval, ((unsigned long *)(
djm@1749 1593 &shadow_linear_pg_table[page_nr])))) )
djm@1749 1594 {
djm@1749 1595 /*
djm@1749 1596 * Since L2s are guaranteed RW, failure indicates the page was not
djm@1749 1597 * shadowed, so ignore.
djm@1749 1598 */
djm@1749 1599 perfc_incrc(shadow_update_va_fail);
djm@1749 1600 }
djm@1749 1601
djm@1749 1602 /*
djm@1749 1603 * If we're in log-dirty mode then we need to note that we've updated
djm@1749 1604 * the PTE in the PT-holding page. We need the machine frame number
djm@1749 1605 * for this.
djm@1749 1606 */
cl349@2957 1607 if ( ed->mm.shadow_mode == SHM_logdirty )
kaf24@2673 1608 mark_dirty(&ed->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
djm@1749 1609
cl349@2957 1610 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
djm@1749 1611 }
djm@1749 1612
djm@1749 1613 deferred_ops = percpu_info[cpu].deferred_ops;
djm@1749 1614 percpu_info[cpu].deferred_ops = 0;
djm@1749 1615
djm@1749 1616 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
djm@1749 1617 unlikely(flags & UVMF_FLUSH_TLB) )
djm@1749 1618 local_flush_tlb();
djm@1749 1619 else if ( unlikely(flags & UVMF_INVLPG) )
djm@1749 1620 __flush_tlb_one(page_nr << PAGE_SHIFT);
djm@1749 1621
djm@1749 1622 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
djm@1749 1623 (void)map_ldt_shadow_page(0);
djm@1749 1624
cl349@3036 1625 UNLOCK_BIGLOCK(d);
cl349@3036 1626
djm@1749 1627 return err;
djm@1749 1628 }
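
/*
 * Editor's sketch (hypothetical wrapper, not part of the original source):
 * the calling convention for do_update_va_mapping() as decoded above --
 * 'page_nr' is the virtual page number whose PTE is being replaced, 'val'
 * is the complete new PTE, and the flags select the TLB maintenance:
 * UVMF_INVLPG for a single-entry flush, UVMF_FLUSH_TLB for a full flush,
 * or neither if the caller batches its own flush.
 */
static inline int update_single_va_example(unsigned long va,
                                           unsigned long new_pte)
{
    /* Replace the PTE for 'va' and invalidate just that one mapping. */
    return do_update_va_mapping(va >> PAGE_SHIFT, new_pte, UVMF_INVLPG);
}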
djm@1749 1629
djm@1749 1630 int do_update_va_mapping_otherdomain(unsigned long page_nr,
djm@1749 1631 unsigned long val,
djm@1749 1632 unsigned long flags,
djm@1749 1633 domid_t domid)
djm@1749 1634 {
djm@1749 1635 unsigned int cpu = smp_processor_id();
djm@1749 1636 struct domain *d;
djm@1749 1637 int rc;
djm@1749 1638
cl349@2957 1639 if ( unlikely(!IS_PRIV(current->domain)) )
djm@1749 1640 return -EPERM;
djm@1749 1641
kaf24@2314 1642 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
djm@1749 1643 if ( unlikely(d == NULL) )
djm@1749 1644 {
djm@1749 1645 MEM_LOG("Unknown domain '%u'", domid);
djm@1749 1646 return -ESRCH;
djm@1749 1647 }
djm@1749 1648
djm@1749 1649 rc = do_update_va_mapping(page_nr, val, flags);
djm@1749 1650
djm@1749 1651 put_domain(d);
kaf24@2314 1652 percpu_info[cpu].foreign = NULL;
djm@1749 1653
djm@1749 1654 return rc;
djm@1749 1655 }
cl349@1879 1656
cl349@1879 1657
cl349@1921 1658
kaf24@2382 1659 /*************************
kaf24@2382 1660 * Writable Pagetables
kaf24@2382 1661 */
cl349@2093 1662
kaf24@2663 1663 ptwr_info_t ptwr_info[NR_CPUS];
cl349@1894 1664
kaf24@2097 1665 #ifdef VERBOSE
cl349@2496 1666 int ptwr_debug = 0x0;
kaf24@2654 1667 #define PTWR_PRINTK(_f, _a...) \
kaf24@2654 1668 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
cl349@2652 1669 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
cl349@2093 1670 #else
kaf24@2654 1671 #define PTWR_PRINTK(_f, _a...) ((void)0)
cl349@2093 1672 #endif
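
/*
 * Editor's orientation note (the real ptwr_info_t lives in a header not
 * shown here; field meanings below are inferred from their usage in this
 * file and may be incomplete):
 *
 *   ptinfo[which].l1va   - guest VA through which the L1 page is being
 *                          written (0 when the slot is idle; the fault
 *                          handler sets the low bit when arming it, so
 *                          the value is never zero)
 *   ptinfo[which].pl1e   - Xen-side mapping of that L1 page
 *   ptinfo[which].page   - snapshot of the L1 taken at fault time, used
 *                          by ptwr_flush() to diff and revalidate entries
 *   ptinfo[which].l2_idx - the L2 slot through which the guest maps this
 *                          L1 (needed to unhook/rehook an ACTIVE page)
 *
 * 'which' selects the per-CPU ACTIVE or INACTIVE slot (PTWR_PT_ACTIVE /
 * PTWR_PT_INACTIVE).
 */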
cl349@1879 1673
kaf24@2663 1674 /* Flush the given writable p.t. page and write-protect it again. */
cl349@2512 1675 void ptwr_flush(const int which)
cl349@1879 1676 {
kaf24@2663 1677 unsigned long sstat, spte, pte, *ptep, l1va;
kaf24@2663 1678 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
kaf24@3022 1679 l2_pgentry_t *pl2e;
kaf24@2663 1680 int i, cpu = smp_processor_id();
cl349@2957 1681 struct exec_domain *ed = current;
cl349@2957 1682 struct domain *d = ed->domain;
cl349@1879 1683
iap10@2640 1684 l1va = ptwr_info[cpu].ptinfo[which].l1va;
cl349@2644 1685 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
cl349@1913 1686
kaf24@2663 1687 /*
kaf24@2663 1688 * STEP 1. Write-protect the p.t. page so no more updates can occur.
kaf24@2663 1689 */
kaf24@2663 1690
kaf24@2663 1691 if ( unlikely(__get_user(pte, ptep)) )
kaf24@2663 1692 {
cl349@2512 1693 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
kaf24@2707 1694 /*
kaf24@2707 1695 * Really a bug. We could read this PTE during the initial fault,
kaf24@2841 1696 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
kaf24@2707 1697 */
kaf24@2707 1698 BUG();
cl349@2414 1699 }
kaf24@2654 1700 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
kaf24@2654 1701 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1702 pte &= ~_PAGE_RW;
iap10@2640 1703
cl349@2957 1704 if ( unlikely(ed->mm.shadow_mode) )
kaf24@2663 1705 {
kaf24@2663 1706 /* Write-protect the p.t. page in the shadow page table. */
cl349@2957 1707 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
kaf24@2663 1708 __put_user(
kaf24@2663 1709 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
kaf24@2663 1710
kaf24@2663 1711 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
cl349@2957 1712 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
kaf24@2663 1713 if ( sstat & PSH_shadowed )
kaf24@2663 1714 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
iap10@2640 1715 }
iap10@2640 1716
kaf24@2663 1717 /* Write-protect the p.t. page in the guest page table. */
kaf24@2663 1718 if ( unlikely(__put_user(pte, ptep)) )
kaf24@2663 1719 {
cl349@2512 1720 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
kaf24@2707 1721 /*
kaf24@2707 1722 * Really a bug. We could write this PTE during the initial fault,
kaf24@2841 1723 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
kaf24@2707 1724 */
kaf24@2707 1725 BUG();
cl349@2414 1726 }
kaf24@2663 1727
kaf24@2663 1728 /* Ensure that there are no stale writable mappings in any TLB. */
kaf24@2841 1729 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
cl349@3325 1730 #if 1
kaf24@2841 1731 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
cl349@3036 1732 #else
cl349@3036 1733 flush_tlb_all();
cl349@3036 1734 #endif
kaf24@2654 1735 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
kaf24@2654 1736 PTWR_PRINT_WHICH, ptep, pte);
cl349@2631 1737
kaf24@2663 1738 /*
kaf24@2663 1739 * STEP 2. Validate any modified PTEs.
kaf24@2663 1740 */
kaf24@2663 1741
cl349@2631 1742 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
kaf24@2663 1743 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2663 1744 {
cl349@2631 1745 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
cl349@2631 1746 nl1e = pl1e[i];
kaf24@2663 1747
kaf24@2663 1748 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
cl349@2631 1749 continue;
kaf24@2663 1750
kaf24@2663 1751 /*
kaf24@2663 1752 * Fast path for PTEs that have merely been write-protected
kaf24@2663 1753 * (e.g., during a Unix fork()). A strict reduction in privilege.
kaf24@2663 1754 */
kaf24@2663 1755 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
kaf24@2663 1756 {
kaf24@2663 1757 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
kaf24@2663 1758 {
kaf24@2663 1759 if ( unlikely(sl1e != NULL) )
kaf24@2673 1760 l1pte_propagate_from_guest(
cl349@2957 1761 &ed->mm, &l1_pgentry_val(nl1e),
kaf24@2663 1762 &l1_pgentry_val(sl1e[i]));
kaf24@2663 1763 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
kaf24@2663 1764 }
cl349@2644 1765 continue;
kaf24@2663 1766 }
kaf24@2663 1767
kaf24@2663 1768 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
kaf24@2663 1769 {
cl349@2631 1770 MEM_LOG("ptwr: Could not re-validate l1 page\n");
kaf24@2707 1771 /*
kaf24@2707 1772 * Make the remaining p.t's consistent before crashing, so the
kaf24@2707 1773 * reference counts are correct.
kaf24@2707 1774 */
kaf24@2707 1775 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
kaf24@2707 1776 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
cl349@2708 1777 unmap_domain_mem(pl1e);
cl349@2708 1778 ptwr_info[cpu].ptinfo[which].l1va = 0;
cl349@3036 1779 UNLOCK_BIGLOCK(d);
cl349@2631 1780 domain_crash();
cl349@2631 1781 }
kaf24@2663 1782
kaf24@2663 1783 if ( unlikely(sl1e != NULL) )
kaf24@2673 1784 l1pte_propagate_from_guest(
cl349@2957 1785 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
kaf24@2663 1786
kaf24@2663 1787 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
kaf24@2663 1788 put_page_from_l1e(ol1e, d);
cl349@2631 1789 }
cl349@2631 1790 unmap_domain_mem(pl1e);
cl349@2631 1791
kaf24@2663 1792 /*
kaf24@2663 1793 * STEP 3. Reattach the L1 p.t. page into the current address space.
kaf24@2663 1794 */
kaf24@2663 1795
cl349@2957 1796 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
kaf24@2663 1797 {
kaf24@2663 1798 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
kaf24@3022 1799 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
cl349@2631 1800 }
iap10@2509 1801
kaf24@2663 1802 /*
kaf24@2663 1803 * STEP 4. Final tidy-up.
kaf24@2663 1804 */
iap10@2509 1805
cl349@2512 1806 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1807
kaf24@2663 1808 if ( unlikely(sl1e != NULL) )
kaf24@2663 1809 {
kaf24@2663 1810 unmap_domain_mem(sl1e);
cl349@2957 1811 put_shadow_status(&ed->mm);
kaf24@2663 1812 }
cl349@1879 1813 }
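
/*
 * Editor's worked example for the STEP 2 fast path in ptwr_flush() above
 * (entry values are illustrative): suppose the snapshot held
 *     ol1e = 0x00234067   (frame 0x234, PRESENT|RW|USER|ACCESSED|DIRTY)
 * and the guest merely write-protected the mapping, so the live entry is
 *     nl1e = 0x00234065   (same frame and flags, RW clear).
 * Then l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e) | _PAGE_RW) holds:
 * the change is a strict reduction in privilege, so no re-validation via
 * get_page_from_l1e() is needed -- only the writable type reference on
 * frame 0x234 is dropped with put_page_type().  Any other difference
 * takes the slow path below it.
 */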
cl349@1879 1814
kaf24@2663 1815 /* Write page fault handler: check if guest is trying to modify a PTE. */
cl349@1879 1816 int ptwr_do_page_fault(unsigned long addr)
cl349@1879 1817 {
kaf24@3022 1818 unsigned long pte, pfn, l2e;
cl349@1879 1819 struct pfn_info *page;
kaf24@3022 1820 l2_pgentry_t *pl2e;
kaf24@2663 1821 int which, cpu = smp_processor_id();
kaf24@2663 1822 u32 l2_idx;
iap10@2458 1823
kaf24@2663 1824 /*
kaf24@2663 1825 * Attempt to read the PTE that maps the VA being accessed. By checking for
kaf24@2663 1826 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
kaf24@2663 1827 */
kaf24@2663 1828 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
kaf24@2663 1829 _PAGE_PRESENT) ||
kaf24@2663 1830 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
cl349@3036 1831 {
kaf24@2663 1832 return 0;
cl349@3036 1833 }
iap10@2509 1834
kaf24@2663 1835 pfn = pte >> PAGE_SHIFT;
kaf24@2663 1836 page = &frame_table[pfn];
cl349@1915 1837
kaf24@2663 1838 /* We are looking only for read-only mappings of p.t. pages. */
kaf24@2663 1839 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
kaf24@2663 1840 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
cl349@3036 1841 {
kaf24@2663 1842 return 0;
cl349@3036 1843 }
kaf24@2663 1844
kaf24@2663 1845 /* Get the L2 index at which this L1 p.t. is always mapped. */
kaf24@2663 1846 l2_idx = page->u.inuse.type_info & PGT_va_mask;
kaf24@2663 1847 if ( unlikely(l2_idx >= PGT_va_unknown) )
cl349@3036 1848 {
kaf24@2663 1849 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
cl349@3036 1850 }
kaf24@2663 1851 l2_idx >>= PGT_va_shift;
kaf24@3022 1852
kaf24@3022 1853 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
kaf24@3022 1854 {
kaf24@3022 1855 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
kaf24@3022 1856 domain_crash();
kaf24@3022 1857 }
kaf24@3022 1858
kaf24@2663 1859 /*
kaf24@2663 1860 * Is the L1 p.t. mapped into the current address space? If so we call it
kaf24@2663 1861 * an ACTIVE p.t., otherwise it is INACTIVE.
kaf24@2663 1862 */
kaf24@2663 1863 pl2e = &linear_l2_table[l2_idx];
kaf24@3022 1864 l2e = l2_pgentry_val(*pl2e);
kaf24@3022 1865 which = PTWR_PT_INACTIVE;
kaf24@3022 1866 if ( (l2e >> PAGE_SHIFT) == pfn )
kaf24@3022 1867 {
cl349@3179 1868 /* Check the PRESENT bit to set ACTIVE. */
kaf24@3022 1869 if ( likely(l2e & _PAGE_PRESENT) )
kaf24@3022 1870 which = PTWR_PT_ACTIVE;
cl349@3179 1871 else {
cl349@3179 1872 /*
cl349@3179 1873 * If the PRESENT bit is clear, we may be conflicting with
cl349@3179 1874 * the current ACTIVE p.t. (it may be the same p.t. mapped
cl349@3179 1875 * at another virt addr).
cl349@3179 1876 * The ptwr_flush call below will restore the PRESENT bit.
cl349@3179 1877 */
cl349@3179 1878 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
cl349@3179 1879 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
cl349@3179 1880 which = PTWR_PT_ACTIVE;
cl349@3179 1881 }
kaf24@3022 1882 }
kaf24@2663 1883
kaf24@2663 1884 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
kaf24@2663 1885 "pfn %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1886 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
kaf24@2663 1887
kaf24@2663 1888 /*
kaf24@2663 1889 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
kaf24@2663 1890 * time. If there is already one, we must flush it out.
kaf24@2663 1891 */
kaf24@2663 1892 if ( ptwr_info[cpu].ptinfo[which].l1va )
kaf24@2663 1893 ptwr_flush(which);
iap10@2507 1894
kaf24@2663 1895 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
kaf24@2663 1896 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
kaf24@2663 1897
kaf24@2663 1898 /* For safety, disconnect the L1 p.t. page from current space. */
kaf24@2663 1899 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
kaf24@2663 1900 {
kaf24@3022 1901 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
cl349@3325 1902 #if 1
kaf24@2841 1903 flush_tlb(); /* XXX Multi-CPU guests? */
cl349@3036 1904 #else
cl349@3036 1905 flush_tlb_all();
cl349@3036 1906 #endif
cl349@1879 1907 }
kaf24@2663 1908
kaf24@2663 1909 /* Temporarily map the L1 page, and make a copy of it. */
kaf24@2663 1910 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
kaf24@2663 1911 memcpy(ptwr_info[cpu].ptinfo[which].page,
kaf24@2663 1912 ptwr_info[cpu].ptinfo[which].pl1e,
kaf24@2663 1913 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
kaf24@2663 1914
kaf24@2663 1915 /* Finally, make the p.t. page writable by the guest OS. */
kaf24@2663 1916 pte |= _PAGE_RW;
kaf24@2663 1917 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
kaf24@2663 1918 &linear_pg_table[addr>>PAGE_SHIFT], pte);
kaf24@2663 1919 if ( unlikely(__put_user(pte, (unsigned long *)
kaf24@2663 1920 &linear_pg_table[addr>>PAGE_SHIFT])) )
kaf24@2663 1921 {
kaf24@2663 1922 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
kaf24@2663 1923 &linear_pg_table[addr>>PAGE_SHIFT]);
kaf24@2707 1924 /* Toss the writable pagetable state and crash. */
kaf24@2707 1925 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
kaf24@2707 1926 ptwr_info[cpu].ptinfo[which].l1va = 0;
kaf24@2663 1927 domain_crash();
kaf24@2663 1928 }
kaf24@2663 1929
kaf24@3090 1930 return EXCRET_fault_fixed;
cl349@1879 1931 }
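
/*
 * Editor's summary of the writable-pagetable cycle implemented by
 * ptwr_do_page_fault() and ptwr_flush() above (no new behaviour, just
 * the sequence in one place):
 *
 *  1. The guest writes to a read-only page that is typed as an L1 page
 *     table; the fault handler verifies this and recovers the L2 slot
 *     the L1 is mapped through from its type_info.
 *  2. The page is classified ACTIVE (hooked into the current address
 *     space) or INACTIVE; since only one of each kind is tracked per
 *     CPU, any previous occupant of that slot is flushed first.
 *  3. An ACTIVE page is temporarily unhooked from its L2 slot and the
 *     TLB flushed; a snapshot of the L1 is taken and the PTE mapping it
 *     is made writable, so the guest's subsequent updates hit the page
 *     directly with no further faults.
 *  4. ptwr_flush() later re-write-protects the page, diffs it against
 *     the snapshot, revalidates any changed entries, and rehooks an
 *     ACTIVE page into its L2 slot.
 */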
cl349@1894 1932
kaf24@2504 1933 static __init int ptwr_init(void)
kaf24@2504 1934 {
kaf24@2504 1935 int i;
kaf24@2504 1936
kaf24@2504 1937 for ( i = 0; i < smp_num_cpus; i++ )
kaf24@2504 1938 {
cl349@2512 1939 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
cl349@2512 1940 (void *)alloc_xenheap_page();
cl349@2512 1941 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
cl349@2512 1942 (void *)alloc_xenheap_page();
kaf24@2504 1943 }
kaf24@2504 1944
kaf24@2504 1945 return 0;
kaf24@2504 1946 }
kaf24@2504 1947 __initcall(ptwr_init);
kaf24@2504 1948
kaf24@2663 1949
kaf24@2663 1950
kaf24@2663 1951
kaf24@2663 1952 /************************************************************************/
kaf24@2663 1953 /************************************************************************/
kaf24@2663 1954 /************************************************************************/
kaf24@2663 1955
cl349@2092 1956 #ifndef NDEBUG
kaf24@2663 1957
cl349@1894 1958 void ptwr_status(void)
cl349@1894 1959 {
cl349@2512 1960 unsigned long pte, *ptep, pfn;
cl349@1894 1961 struct pfn_info *page;
cl349@1894 1962 int cpu = smp_processor_id();
cl349@1894 1963
cl349@2512 1964 ptep = (unsigned long *)&linear_pg_table
cl349@2512 1965 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
kaf24@2237 1966
cl349@2512 1967 if ( __get_user(pte, ptep) ) {
cl349@2512 1968 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
cl349@2495 1969 domain_crash();
cl349@1894 1970 }
cl349@1894 1971
cl349@2495 1972 pfn = pte >> PAGE_SHIFT;
cl349@2495 1973 page = &frame_table[pfn];
cl349@2495 1974 printk("need to alloc l1 page %p\n", page);
cl349@2495 1975 /* make pt page writable */
cl349@2495 1976 printk("need to make read-only: l1 page at %p is %08lx\n",
cl349@2512 1977 ptep, pte);
cl349@2495 1978
cl349@2512 1979 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
cl349@1894 1980 return;
cl349@1894 1981
cl349@2512 1982 if ( __get_user(pte, (unsigned long *)
cl349@2512 1983 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
cl349@2491 1984 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
cl349@2512 1985 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
cl349@2491 1986 domain_crash();
cl349@2414 1987 }
cl349@1894 1988 pfn = pte >> PAGE_SHIFT;
cl349@1894 1989 page = &frame_table[pfn];
cl349@1894 1990 }
iap10@2479 1991
kaf24@2637 1992 void audit_domain(struct domain *d)
iap10@2479 1993 {
iap10@2595 1994 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
kaf24@2637 1995
kaf24@2637 1996 void adjust (struct pfn_info *page, int dir, int adjtype)
iap10@2479 1997 {
cl349@2491 1998 int count = page->count_info & PGC_count_mask;
iap10@2479 1999
cl349@2491 2000 if ( adjtype )
cl349@2491 2001 {
cl349@2491 2002 int tcount = page->u.inuse.type_info & PGT_count_mask;
cl349@2491 2003
cl349@2491 2004 ttot++;
iap10@2479 2005
cl349@2491 2006 tcount += dir;
iap10@2479 2007
cl349@2491 2008 if ( tcount < 0 )
cl349@2491 2009 {
cl349@2644 2010 /* This will only come out once. */
kaf24@2637 2011 printk("Audit %d: type count went below zero pfn=%x "
kaf24@2637 2012 "taf=%x otaf=%x\n",
kaf24@2748 2013 d->id, page-frame_table,
cl349@2491 2014 page->u.inuse.type_info,
cl349@2491 2015 page->tlbflush_timestamp);
cl349@2491 2016 }
cl349@2491 2017
cl349@2491 2018 page->u.inuse.type_info =
iap10@2573 2019 (page->u.inuse.type_info & ~PGT_count_mask) |
cl349@2644 2020 (tcount & PGT_count_mask);
cl349@2491 2021 }
iap10@2479 2022
cl349@2491 2023 ctot++;
cl349@2491 2024 count += dir;
cl349@2491 2025 if ( count < 0 )
cl349@2491 2026 {
cl349@2644 2027 /* This will only come out once. */
kaf24@2637 2028 printk("Audit %d: general count went below zero pfn=%x "
kaf24@2637 2029 "taf=%x otaf=%x\n",
kaf24@2748 2030 d->id, page-frame_table,
cl349@2491 2031 page->u.inuse.type_info,
cl349@2491 2032 page->tlbflush_timestamp);
cl349@2491 2033 }
cl349@2491 2034
cl349@2491 2035 page->count_info =
iap10@2573 2036 (page->count_info & ~PGC_count_mask) |
cl349@2644 2037 (count & PGC_count_mask);
iap10@2479 2038
iap10@2479 2039 }
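
/*
 * Editor's note on adjust(): both counters are updated with a
 * read-modify-write confined to the bits under the relevant mask,
 * e.g. for the general count:
 *     new count_info = (count_info & ~PGC_count_mask)
 *                    | ((count + dir) & PGC_count_mask)
 * so any flag bits stored alongside the count are preserved while the
 * count itself is bumped up or down by 'dir'.
 */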
iap10@2479 2040
kaf24@2637 2041 void scan_for_pfn(struct domain *d, unsigned long xpfn)
iap10@2479 2042 {
kaf24@2637 2043 unsigned long pfn, *pt;
cl349@2491 2044 struct list_head *list_ent;
kaf24@2637 2045 struct pfn_info *page;
cl349@2491 2046 int i;
iap10@2479 2047
iap10@2479 2048 list_ent = d->page_list.next;
cl349@2491 2049 for ( i = 0; (list_ent != &d->page_list); i++ )
cl349@2491 2050 {
cl349@2491 2051 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2052 page = &frame_table[pfn];
cl349@2491 2053
kaf24@2637 2054 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2055 {
kaf24@2637 2056 case PGT_l1_page_table:
kaf24@2637 2057 case PGT_l2_page_table:
kaf24@2637 2058 pt = map_domain_mem(pfn<<PAGE_SHIFT);
cl349@2491 2059 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
kaf24@2637 2060 if ( (pt[i] & _PAGE_PRESENT) &&
kaf24@2637 2061 ((pt[i] >> PAGE_SHIFT) == xpfn) )
kaf24@2637 2062 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
kaf24@2748 2063 d->id, i, pfn, page->u.inuse.type_info,
kaf24@2637 2064 page->count_info);
cl349@2491 2065 unmap_domain_mem(pt);
cl349@2491 2066 }
iap10@2479 2067
cl349@2491 2068 list_ent = frame_table[pfn].list.next;
cl349@2491 2069 }
iap10@2479 2070
iap10@2479 2071 }
iap10@2479 2072
kaf24@2637 2073 void scan_for_pfn_remote(unsigned long xpfn)
iap10@2479 2074 {
cl349@2491 2075 struct domain *e;
cl349@2491 2076 for_each_domain ( e )
cl349@2491 2077 scan_for_pfn( e, xpfn );
iap10@2479 2078 }
iap10@2479 2079
iap10@2479 2080 int i;
iap10@2479 2081 unsigned long pfn;
iap10@2479 2082 struct list_head *list_ent;
kaf24@2637 2083 struct pfn_info *page;
iap10@2479 2084
cl349@3036 2085 if ( d != current->domain )
cl349@2491 2086 domain_pause(d);
iap10@2479 2087 synchronise_pagetables(~0UL);
iap10@2479 2088
iap10@2479 2089 printk("pt base=%lx sh_info=%x\n",
cl349@3036 2090 pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
cl349@2491 2091 virt_to_page(d->shared_info)-frame_table);
cl349@2491 2092
iap10@2479 2093 spin_lock(&d->page_alloc_lock);
iap10@2479 2094
kaf24@2637 2095 /* PHASE 0: sanity-check ownership and counts; snapshot type_info in tlbflush_timestamp. */
iap10@2479 2096
iap10@2479 2097 list_ent = d->page_list.next;
iap10@2479 2098 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2099 {
cl349@2491 2100 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2101 page = &frame_table[pfn];
iap10@2479 2102
cl349@2491 2103 if ( page->u.inuse.domain != d )
cl349@2491 2104 BUG();
iap10@2479 2105
cl349@2491 2106 if ( (page->u.inuse.type_info & PGT_count_mask) >
cl349@2491 2107 (page->count_info & PGC_count_mask) )
cl349@2491 2108 printk("taf > caf %x %x pfn=%lx\n",
cl349@2491 2109 page->u.inuse.type_info, page->count_info, pfn );
iap10@2479 2110
kaf24@2637 2111 #if 0 /* SYSV shared memory pages plus writeable files. */
cl349@2491 2112 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
cl349@2491 2113 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2114 {
cl349@2491 2115 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2116 pfn,
cl349@2491 2117 page->u.inuse.type_info,
cl349@2491 2118 page->count_info );
cl349@2491 2119 scan_for_pfn_remote(pfn);
cl349@2491 2120 }
cl349@2092 2121 #endif
cl349@2491 2122 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
cl349@2491 2123 (page->u.inuse.type_info & PGT_count_mask) > 1 )
cl349@2491 2124 {
cl349@2491 2125 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
cl349@2491 2126 pfn,
cl349@2491 2127 page->u.inuse.type_info,
cl349@2491 2128 page->count_info );
cl349@2491 2129 }
iap10@2479 2130
kaf24@2637 2131 /* Use tlbflush_timestamp to store original type_info. */
cl349@2491 2132 page->tlbflush_timestamp = page->u.inuse.type_info;
iap10@2479 2133
cl349@2491 2134 list_ent = frame_table[pfn].list.next;
iap10@2479 2135 }
iap10@2479 2136
iap10@2479 2137
kaf24@2637 2138 /* PHASE 1: subtract the references accounted for by pinning, page tables and the PT base. */
iap10@2479 2139
cl349@3036 2140 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);
iap10@2479 2141
iap10@2479 2142 list_ent = d->page_list.next;
iap10@2479 2143 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2144 {
kaf24@2637 2145 unsigned long *pt;
cl349@2491 2146 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2147 page = &frame_table[pfn];
iap10@2479 2148
cl349@2491 2149 if ( page->u.inuse.domain != d )
cl349@2491 2150 BUG();
iap10@2479 2151
cl349@2491 2152 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2153 {
cl349@2491 2154 case PGT_l2_page_table:
iap10@2479 2155
cl349@2491 2156 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2157 printk("Audit %d: L2 not validated %x\n",
kaf24@2748 2158 d->id, page->u.inuse.type_info);
iap10@2479 2159
cl349@2491 2160 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2161 printk("Audit %d: L2 not pinned %x\n",
kaf24@2748 2162 d->id, page->u.inuse.type_info);
cl349@2491 2163 else
cl349@2491 2164 adjust( page, -1, 1 );
cl349@2491 2165
cl349@2491 2166 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2167
cl349@2491 2168 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2169 {
cl349@2491 2170 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2171 {
cl349@2491 2172 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2173 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2174
cl349@2491 2175 if ( l1page->u.inuse.domain != d )
cl349@2491 2176 {
kaf24@2637 2177 printk("L2: Skip bizarre page belonging to other "
kaf24@2637 2178 "dom %p\n", l1page->u.inuse.domain);
cl349@2491 2179 continue;
cl349@2491 2180 }
kaf24@2637 2181
kaf24@2637 2182 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
kaf24@2637 2183 PGT_l2_page_table )
kaf24@2637 2184 printk("Audit %d: [%x] Found %s Linear PT "
kaf24@2748 2185 "t=%x pfn=%lx\n", d->id, i,
kaf24@2637 2186 (l1pfn==pfn) ? "Self" : "Other",
kaf24@2637 2187 l1page->u.inuse.type_info,
kaf24@2637 2188 l1pfn);
kaf24@2637 2189 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
kaf24@2637 2190 PGT_l1_page_table )
kaf24@2637 2191 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
kaf24@2748 2192 d->id, i,
kaf24@2637 2193 l1page->u.inuse.type_info,
kaf24@2637 2194 l1pfn);
iap10@2479 2195
kaf24@2637 2196 adjust(l1page, -1, 1);
cl349@2491 2197 }
cl349@2491 2198 }
iap10@2479 2199
cl349@2491 2200 unmap_domain_mem(pt);
iap10@2479 2201
cl349@2491 2202 break;
iap10@2479 2203
iap10@2479 2204
cl349@2491 2205 case PGT_l1_page_table:
cl349@2491 2206
cl349@2491 2207 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2208 adjust( page, -1, 1 );
iap10@2479 2209
cl349@2491 2210 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
cl349@2491 2211 printk("Audit %d: L1 not validated %x\n",
kaf24@2748 2212 d->id, page->u.inuse.type_info);
iap10@2479 2213 #if 0
cl349@2491 2214 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
cl349@2491 2215 printk("Audit %d: L1 not pinned %x\n",
kaf24@2748 2216 d->id, page->u.inuse.type_info);
iap10@2479 2217 #endif
cl349@2491 2218 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2219
cl349@2491 2220 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2221 {
cl349@2491 2222 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2223 {
cl349@2491 2224 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2225 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2226
cl349@2644 2227 if ( l1pfn < 0x100 )
cl349@2644 2228 {
cl349@2644 2229 lowmem_mappings++;
cl349@2644 2230 continue;
cl349@2644 2231 }
iap10@2595 2232
cl349@2644 2233 if ( l1pfn > max_page )
cl349@2644 2234 {
cl349@2644 2235 io_mappings++;
cl349@2644 2236 continue;
cl349@2644 2237 }
iap10@2595 2238
cl349@2491 2239 if ( pt[i] & _PAGE_RW )
cl349@2491 2240 {
iap10@2479 2241
cl349@2491 2242 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2243 PGT_l1_page_table ||
cl349@2491 2244 (l1page->u.inuse.type_info & PGT_type_mask) ==
cl349@2491 2245 PGT_l2_page_table )
cl349@2491 2246 printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
kaf24@2748 2247 d->id, i,
cl349@2491 2248 l1page->u.inuse.type_info,
cl349@2491 2249 l1pfn);
iap10@2479 2250
cl349@2491 2251 }
iap10@2479 2252
cl349@2491 2253 if ( l1page->u.inuse.domain != d )
cl349@2491 2254 {
kaf24@2637 2255 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
kaf24@2637 2256 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
kaf24@2748 2257 d->id, pfn, i,
iap10@2595 2258 (unsigned long)l1page->u.inuse.domain,
cl349@2644 2259 l1pfn,
cl349@2644 2260 l1page->count_info,
cl349@2644 2261 l1page->u.inuse.type_info,
cl349@2644 2262 machine_to_phys_mapping[l1pfn]);
cl349@2491 2263 continue;
cl349@2491 2264 }
iap10@2479 2265
kaf24@2637 2266 adjust(l1page, -1, 0);
cl349@2491 2267 }
cl349@2491 2268 }
iap10@2479 2269
cl349@2491 2270 unmap_domain_mem(pt);
iap10@2479 2271
cl349@2491 2272 break;
iap10@2595 2273 }
iap10@2479 2274
cl349@2491 2275 list_ent = frame_table[pfn].list.next;
iap10@2479 2276 }
iap10@2479 2277
kaf24@2637 2278 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
cl349@2644 2279 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
kaf24@2748 2280 d->id, lowmem_mappings, io_mappings);
iap10@2595 2281
kaf24@2637 2282 /* PHASE 2: with those references removed, check the remaining counts are as expected. */
iap10@2479 2283
iap10@2479 2284 ctot = ttot = 0;
iap10@2479 2285 list_ent = d->page_list.next;
iap10@2479 2286 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2287 {
cl349@2491 2288 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2289 page = &frame_table[pfn];
iap10@2479 2290
cl349@2491 2291 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2292 {
cl349@2491 2293 case PGT_l1_page_table:
cl349@2491 2294 case PGT_l2_page_table:
cl349@2491 2295 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
cl349@2491 2296 {
cl349@2491 2297 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
kaf24@2748 2298 d->id, page->u.inuse.type_info,
cl349@2491 2299 page->tlbflush_timestamp,
cl349@2491 2300 page->count_info, pfn );
cl349@2491 2301 scan_for_pfn_remote(pfn);
cl349@2491 2302 }
cl349@2491 2303 default: /* the page-table cases above fall through to this general check */
cl349@2491 2304 if ( (page->count_info & PGC_count_mask) != 1 )
cl349@2491 2305 {
kaf24@2637 2306 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
kaf24@2748 2307 d->id,
cl349@2491 2308 page->count_info,
cl349@2491 2309 page->u.inuse.type_info,
cl349@2491 2310 page->tlbflush_timestamp, pfn );
cl349@2491 2311 scan_for_pfn_remote(pfn);
cl349@2491 2312 }
cl349@2491 2313 break;
cl349@2491 2314 }
iap10@2479 2315
cl349@2491 2316 list_ent = frame_table[pfn].list.next;
iap10@2479 2317 }
iap10@2479 2318
kaf24@2637 2319 /* PHASE 3: restore the references subtracted in phase 1 and clear the snapshots. */
iap10@2479 2320
iap10@2479 2321 list_ent = d->page_list.next;
iap10@2479 2322 for ( i = 0; (list_ent != &d->page_list); i++ )
iap10@2479 2323 {
kaf24@2637 2324 unsigned long *pt;
cl349@2491 2325 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
cl349@2491 2326 page = &frame_table[pfn];
iap10@2479 2327
cl349@2491 2328 switch ( page->u.inuse.type_info & PGT_type_mask )
cl349@2491 2329 {
cl349@2491 2330 case PGT_l2_page_table:
cl349@2491 2331 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2332 adjust( page, 1, 1 );
iap10@2479 2333
cl349@2491 2334 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2335
cl349@2491 2336 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
cl349@2491 2337 {
cl349@2491 2338 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2339 {
cl349@2491 2340 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2341 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2342
cl349@2491 2343 if ( l1page->u.inuse.domain == d )
kaf24@2637 2344 adjust(l1page, 1, 1);
cl349@2491 2345 }
cl349@2491 2346 }
iap10@2479 2347
cl349@2491 2348 unmap_domain_mem(pt);
cl349@2491 2349 break;
iap10@2479 2350
cl349@2491 2351 case PGT_l1_page_table:
cl349@2491 2352 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
cl349@2491 2353 adjust( page, 1, 1 );
iap10@2479 2354
cl349@2491 2355 pt = map_domain_mem( pfn<<PAGE_SHIFT );
iap10@2479 2356
cl349@2491 2357 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
cl349@2491 2358 {
cl349@2491 2359 if ( pt[i] & _PAGE_PRESENT )
cl349@2491 2360 {
cl349@2491 2361 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
cl349@2491 2362 struct pfn_info *l1page = &frame_table[l1pfn];
iap10@2479 2363
kaf24@2637 2364 if ( (l1page->u.inuse.domain != d) ||
kaf24@2637 2365 (l1pfn < 0x100) || (l1pfn > max_page) )
kaf24@2637 2366 continue;
iap10@2595 2367
cl349@2644 2368 adjust(l1page, 1, 0);
cl349@2491 2369 }
cl349@2491 2370 }
iap10@2479 2371
cl349@2491 2372 unmap_domain_mem(pt);
cl349@2491 2373 break;
cl349@2491 2374 }
iap10@2479 2375
iap10@2479 2376
kaf24@2637 2377 page->tlbflush_timestamp = 0;
iap10@2479 2378
cl349@2491 2379 list_ent = frame_table[pfn].list.next;
iap10@2479 2380 }
iap10@2479 2381
iap10@2479 2382 spin_unlock(&d->page_alloc_lock);
iap10@2479 2383
cl349@3036 2384 adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);
iap10@2479 2385
kaf24@2748 2386 printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
iap10@2479 2387
cl349@3036 2388 if ( d != current->domain )
cl349@2491 2389 domain_unpause(d);
iap10@2479 2390 }
iap10@2479 2391
cl349@2491 2392 void audit_domains(void)
iap10@2479 2393 {
iap10@2479 2394 struct domain *d;
iap10@2479 2395 for_each_domain ( d )
cl349@2644 2396 audit_domain(d);
iap10@2479 2397 }
iap10@2479 2398
kaf24@2842 2399 void audit_domains_key(unsigned char key)
iap10@2479 2400 {
kaf24@2842 2401 audit_domains();
iap10@2479 2402 }
iap10@2479 2403
iap10@2479 2404 #endif