
view xen/common/memory.c @ 650:729e25262c44

bitkeeper revision 1.343 (3f1185f8LtpFl0Oco6NKylLtg11l2A)

misc fixes
author iap10@labyrinth.cl.cam.ac.uk
date Sun Jul 13 16:16:56 2003 +0000 (2003-07-13)
parents 6879a4610638
children 816756811961
line source
1 /******************************************************************************
2 * memory.c
3 *
4 * Copyright (c) 2002 K A Fraser
5 *
6 * A description of the page table API:
7 *
8 * Domains trap to process_page_updates with a list of update requests.
9 * This is a list of (ptr, val) pairs, where the requested operation
10 * is *ptr = val.
11 *
12 * Reference counting of pages:
13 * ----------------------------
14 * Each page has two refcounts: tot_count and type_count.
15 *
16 * TOT_COUNT is the obvious reference count. It counts all uses of a
17 * physical page frame by a domain, including uses as a page directory,
18 * a page table, or simple mappings via a PTE. This count prevents a
19 * domain from releasing a frame back to the hypervisor's free pool when
20 * it is still referencing it!
21 *
22 * TYPE_COUNT is more subtle. A frame can be put to one of three
23 * mutually-exclusive uses: it might be used as a page directory, or a
24 * page table, or it may be mapped writeable by the domain [of course, a
25 * frame need not be used in any of these three ways!].
26 * So, type_count is a count of the number of times a frame is being
27 * referred to in its current incarnation. Therefore, a page can only
28 * change its type when its type count is zero.
29 *
30 * Pinning the page type:
31 * ----------------------
32 * The type of a page can be pinned/unpinned with the commands
33 * PGEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
34 * pinning is not reference counted, so it can't be nested).
35 * This is useful to prevent a page's type count falling to zero, at which
36 * point safety checks would need to be carried out next time the count
37 * is increased again.
38 *
39 * A further note on writeable page mappings:
40 * ------------------------------------------
41 * For simplicity, the count of writeable mappings for a page may not
42 * correspond to reality. The 'writeable count' is incremented for every
43 * PTE which maps the page with the _PAGE_RW flag set. However, for
44 * write access to be possible the page directory entry must also have
45 * its _PAGE_RW bit set. We do not check this as it complicates the
46 * reference counting considerably [consider the case of multiple
47 * directory entries referencing a single page table, some with the RW
48 * bit set, others not -- it starts getting a bit messy].
49 * In normal use, this simplification shouldn't be a problem.
50 * However, the logic can be added if required.
51 *
52 * One more note on read-only page mappings:
53 * -----------------------------------------
54 * We want domains to be able to map pages for read-only access. The
55 * main reason is that page tables and directories should be readable
56 * by a domain, but it would not be safe for them to be writeable.
57 * However, domains have free access to rings 1 & 2 of the Intel
58 * privilege model. In terms of page protection, these are considered
59 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
60 * read-only restrictions are respected in supervisor mode -- if the
61 * bit is clear then any mapped page is writeable.
62 *
63 * We get round this by always setting the WP bit and disallowing
64 * updates to it. This is very unlikely to cause a problem for guest
65 * OSes, which will generally use the WP bit to simplify their copy-on-write
66 * implementation (in that case, the OS wants a fault when it writes to
67 * an application-supplied buffer).
68 */
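/*
 * Illustrative sketch (not part of the original file): how a guest might
 * drive the (ptr, val) update interface described above. The 'ptr' and
 * 'val' fields match page_update_request_t as consumed by
 * do_process_page_updates() below; the trap/hypercall mechanism by which
 * the batch reaches the hypervisor lives outside this file, so
 * guest_trap_to_hypervisor() is a hypothetical placeholder. We also assume
 * PGREQ_NORMAL occupies the low bits of 'ptr', since those bits demux the
 * command in do_process_page_updates().
 */
#if 0 /* example only -- not compiled */
static void example_batch_pte_update(unsigned long pte_machine_addr,
                                     unsigned long new_pte_val)
{
    page_update_request_t reqs[1];

    /* PGREQ_NORMAL: a checked update to a level-1 or level-2 table entry.
     * The requested operation is simply *ptr = val. */
    reqs[0].ptr = pte_machine_addr | PGREQ_NORMAL; /* machine addr of PTE */
    reqs[0].val = new_pte_val;                     /* new PTE contents    */

    /* Hypothetical: ends up in do_process_page_updates(reqs, 1). */
    guest_trap_to_hypervisor(reqs, 1);
}
#endif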
71 /*
72 * THE FOLLOWING ARE ISSUES IF GUEST OPERATING SYSTEMS BECOME SMP-CAPABLE.
73 * [THAT IS, THEY'RE NOT A PROBLEM NOW, AND MAY NOT EVER BE.]
74 * -----------------------------------------------------------------------
75 *
76 * *********
77 * UPDATE 15/7/02: The interface has changed -- updates now specify the physical
78 * address of the page-table entry, rather than a virtual address,
79 * so the hypervisor no longer "walks" the page tables. Therefore the
80 * solution below cannot work. Another possibility is to add a new entry
81 * to our "struct page" which says to which top-level page table each
82 * lower-level page table or writeable mapping belongs. If it belongs to more
83 * than one, we'd probably just flush on all processors running the domain.
84 * *********
85 *
86 * ** 1 **
87 * The problem involves creating new page tables which might be mapped
88 * writeable in the TLB of another processor. As an example, a domain might be
89 * running in two contexts (ie. on two processors) simultaneously, using the
90 * same top-level page table in both contexts. Now, if context 1 sends an
91 * update request [make page P read-only, add a reference to page P as a page
92 * table], that will succeed if there was only one writeable mapping of P.
93 * However, that mapping may persist in the TLB of context 2.
94 *
95 * Solution: when installing a new page table, we must flush foreign TLBs as
96 * necessary. Naive solution is to flush on any processor running our domain.
97 * Cleverer solution is to flush on any processor running same top-level page
98 * table, but this will sometimes fail (consider two different top-level page
99 * tables which have a shared lower-level page table).
100 *
101 * A better solution: when squashing a write reference, check how many times
102 * that lowest-level table entry is referenced by ORing refcounts of tables
103 * down the page-table hierarchy. If the result is != 1, we require flushing all
104 * instances of current domain if a new table is installed (because the
105 * lowest-level entry may be referenced by many top-level page tables).
106 * However, the common case will be result == 1, so we only need to flush
107 * processors with the same top-level page table. Make choice at
108 * table-installation time based on a `flush_level' flag, which is
109 * FLUSH_NONE, FLUSH_PAGETABLE, FLUSH_DOMAIN. A flush reduces this
110 * to FLUSH_NONE, while squashed write mappings can only promote up
111 * to more aggressive flush types.
112 *
113 * ** 2 **
114 * The same problem occurs when removing a page table, at level 1 say, and then
115 * making it writeable. A TLB flush is needed in between, otherwise another processor
116 * might write an illegal mapping into the old table, while yet another
117 * processor can use the illegal mapping because of a stale level-2 TLB
118 * entry. So, removal of a table reference sets 'flush_level' appropriately,
119 * and a flush occurs on next addition of a fresh write mapping.
120 *
121 * BETTER SOLUTION FOR BOTH 1 AND 2:
122 * When type_refcnt goes to zero, leave old type in place (don't set to
123 * PGT_none). Then, only flush if making a page table of a page with
124 * (cnt=0,type=PGT_writeable), or when adding a write mapping for a page
125 * with (cnt=0, type=PGT_pagexxx). A TLB flush will cause all pages
126 * with refcnt==0 to be reset to PGT_none. Need an array for the purpose,
127 * added to when a type_refcnt goes to zero, and emptied on a TLB flush.
128 * Either have per-domain table, or force TLB flush at end of each
129 * call to 'process_page_updates'.
130 * Most OSes will always keep a writeable reference hanging around, and
131 * page table structure is fairly static, so this mechanism should be
132 * fairly cheap.
133 *
134 * MAYBE EVEN BETTER? [somewhat dubious: not for first cut of the code]:
135 * If we need to force an intermediate flush, those other processors
136 * spin until we complete, then do a single TLB flush. They can spin on
137 * the lock protecting 'process_page_updates', and continue when that
138 * is freed. Saves cost of setting up and servicing an IPI: later
139 * communication is synchronous. Processors trying to install the domain
140 * or domain&pagetable would also enter the spin.
141 *
142 * ** 3 **
143 * Indeed, this problem generalises to reusing page tables at different
144 * levels of the hierarchy (conceptually, the guest OS can use the
145 * hypervisor to introduce illegal table entries by proxy). Consider
146 * unlinking a level-1 page table and reintroducing at level 2 with no
147 * TLB flush. Hypervisor can add a reference to some other level-1 table
148 * with the RW bit set. This is fine in the level-2 context, but some
149 * other processor may still be using that table in level-1 context
150 * (due to a stale TLB entry). At level 1 it may look like the
151 * processor has write access to the other level-1 page table! It can
152 * therefore add illegal values there with impunity :-(
153 *
154 * Fortunately, the solution above generalises to this extended problem.
155 */
157 /*
158 * UPDATE 12.11.02.: We no longer have struct page and mem_map. These
159 * have been replaced by struct pfn_info and frame_table respectively.
160 *
161 * free_list is a list_head linking all system-owned free pages.
162 * It is initialized in init_frametable.
163 *
164 * Boris Dragovic.
165 */
167 #include <xeno/config.h>
168 #include <xeno/init.h>
169 #include <xeno/lib.h>
170 #include <xeno/mm.h>
171 #include <xeno/sched.h>
172 #include <xeno/errno.h>
173 #include <asm/page.h>
174 #include <asm/flushtlb.h>
175 #include <asm/io.h>
176 #include <asm/uaccess.h>
177 #include <asm/domain_page.h>
179 #if 0
180 #define MEM_LOG(_f, _a...) printk("DOM%d: (file=memory.c, line=%d) " _f "\n", current->domain, __LINE__, ## _a )
181 #else
182 #define MEM_LOG(_f, _a...) ((void)0)
183 #endif
185 /* Domain 0 is allowed to submit requests on behalf of others. */
186 #define DOMAIN_OKAY(_f) \
187 ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
189 /* 'get' checks parameter for validity before inc'ing refcnt. */
190 static int get_l2_table(unsigned long page_nr);
191 static int get_l1_table(unsigned long page_nr);
192 static int get_page(unsigned long page_nr, int writeable);
193 static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
194 /* 'put' does no checking because if refcnt not zero, entity must be valid. */
195 static void put_l2_table(unsigned long page_nr);
196 static void put_l1_table(unsigned long page_nr);
197 static void put_page(unsigned long page_nr, int writeable);
198 static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
200 static int mod_l2_entry(unsigned long, l2_pgentry_t);
201 static int mod_l1_entry(unsigned long, l1_pgentry_t);
203 /* The frame table, its size in bytes, and the machine page frame count. */
204 frame_table_t * frame_table;
205 unsigned long frame_table_size;
206 unsigned long max_page;
208 struct list_head free_list;
209 spinlock_t free_list_lock = SPIN_LOCK_UNLOCKED;
210 unsigned int free_pfns;
212 /* Used to defer TLB flushes to the end of a batch of page-table updates. */
213 static int flush_tlb[NR_CPUS] __cacheline_aligned;
216 /*
217 * init_frametable:
218 * Initialise per-frame memory information. This goes directly after
219 * MAX_MONITOR_ADDRESS in physical memory.
220 */
221 void __init init_frametable(unsigned long nr_pages)
222 {
223 struct pfn_info *pf;
224 unsigned long page_index;
225 unsigned long flags;
227 memset(flush_tlb, 0, sizeof(flush_tlb));
229 max_page = nr_pages;
230 frame_table_size = nr_pages * sizeof(struct pfn_info);
231 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
232 frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
233 memset(frame_table, 0, frame_table_size);
235 free_pfns = 0;
237 /* Put all domain-allocatable memory on a free list. */
238 spin_lock_irqsave(&free_list_lock, flags);
239 INIT_LIST_HEAD(&free_list);
240 for( page_index = (__pa(frame_table) + frame_table_size) >> PAGE_SHIFT;
241 page_index < nr_pages;
242 page_index++ )
243 {
244 pf = list_entry(&frame_table[page_index].list, struct pfn_info, list);
245 list_add_tail(&pf->list, &free_list);
246 free_pfns++;
247 }
248 spin_unlock_irqrestore(&free_list_lock, flags);
249 }
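/*
 * Illustrative sketch (not in this file): how a frame might later be taken
 * off 'free_list'. The real allocator lives elsewhere in the hypervisor;
 * this hypothetical helper only shows the structures initialised above
 * (free_list, free_list_lock, free_pfns, struct pfn_info) being used.
 */
#if 0 /* example only -- not compiled */
static struct pfn_info *example_alloc_domain_frame(void)
{
    struct pfn_info *pf = NULL;
    unsigned long flags;

    spin_lock_irqsave(&free_list_lock, flags);
    if ( !list_empty(&free_list) )
    {
        /* Take the first free frame off the list. */
        pf = list_entry(free_list.next, struct pfn_info, list);
        list_del(&pf->list);
        free_pfns--;
    }
    spin_unlock_irqrestore(&free_list_lock, flags);

    return pf; /* frame number is (pf - frame_table) */
}
#endif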
252 static void __invalidate_shadow_ldt(void)
253 {
254 int i;
255 unsigned long pfn;
256 struct pfn_info *page;
258 current->mm.shadow_ldt_mapcnt = 0;
260 for ( i = 16; i < 32; i++ )
261 {
262 pfn = l1_pgentry_to_pagenr(current->mm.perdomain_pt[i]);
263 if ( pfn == 0 ) continue;
264 current->mm.perdomain_pt[i] = mk_l1_pgentry(0);
265 page = frame_table + pfn;
266 ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
267 ASSERT((page->flags & PG_domain_mask) == current->domain);
268 ASSERT((page->type_count != 0) && (page->tot_count != 0));
269 put_page_type(page);
270 put_page_tot(page);
271 }
272 }
273 static inline void invalidate_shadow_ldt(void)
274 {
275 if ( current->mm.shadow_ldt_mapcnt != 0 )
276 __invalidate_shadow_ldt();
277 }
280 /* Return original refcnt, or -1 on error. */
281 static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
282 {
283 struct pfn_info *page;
284 unsigned long flags;
286 if ( page_nr >= max_page )
287 {
288 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
289 return -1;
290 }
291 page = frame_table + page_nr;
292 flags = page->flags;
293 if ( !DOMAIN_OKAY(flags) )
294 {
295 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
296 return -1;
297 }
298 if ( (flags & PG_type_mask) != type )
299 {
300 if ( page_type_count(page) != 0 )
301 {
302 MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
303 page_nr << PAGE_SHIFT,
304 flags & PG_type_mask, type, page_type_count(page));
305 return -1;
306 }
308 page->flags &= ~PG_type_mask;
309 page->flags |= type;
310 }
312 get_page_tot(page);
313 return get_page_type(page);
314 }
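/*
 * Example of the type rule enforced above: a frame currently of type
 * PGT_writeable_page with type_count == 2 cannot be retyped as
 * PGT_l1_page_table; both writeable mappings must first be dropped
 * (type_count == 0) before inc_page_refcnt() will accept the new type.
 */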
317 /* Return new refcnt, or -1 on error. */
318 static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
319 {
320 struct pfn_info *page;
322 if ( page_nr >= max_page )
323 {
324 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
325 return -1;
326 }
327 page = frame_table + page_nr;
328 if ( !DOMAIN_OKAY(page->flags) ||
329 ((page->flags & PG_type_mask) != type) )
330 {
331 MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
332 page->flags & PG_domain_mask, page->flags & PG_type_mask,
333 type);
334 return -1;
335 }
336 ASSERT(page_type_count(page) != 0);
337 put_page_tot(page);
338 return put_page_type(page);
339 }
342 /* We allow a L2 table to map itself, to achieve a linear pagetable. */
343 /* NB. There's no need for a put_twisted_l2_table() function!! */
344 static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
345 {
346 unsigned long l2v = l2_pgentry_val(l2e);
348 /* Clearly the mapping must be read-only :-) */
349 if ( (l2v & _PAGE_RW) )
350 {
351 MEM_LOG("Attempt to install twisted L2 entry with write permissions");
352 return -1;
353 }
355 /* This is a sufficient final check. */
356 if ( (l2v >> PAGE_SHIFT) != entry_pfn )
357 {
358 MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
359 return -1;
360 }
362 /* We don't bump the reference counts. */
363 return 0;
364 }
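/*
 * Example of a "twisted" (linear) mapping accepted above: if the L2 table
 * itself lives in frame 'entry_pfn', then an entry whose value is
 * (entry_pfn << PAGE_SHIFT) | _PAGE_PRESENT (with _PAGE_RW clear) maps the
 * page directory into its own address space read-only, giving the guest a
 * linear view of its page tables.
 */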
367 static int get_l2_table(unsigned long page_nr)
368 {
369 l2_pgentry_t *p_l2_entry, l2_entry;
370 int i, ret=0;
372 ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
373 if ( ret != 0 ) return (ret < 0) ? ret : 0;
375 /* NEW level-2 page table! Deal with every PDE in the table. */
376 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
377 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
378 {
379 l2_entry = *p_l2_entry++;
380 if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
381 if ( (l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)) )
382 {
383 MEM_LOG("Bad L2 page type settings %04lx",
384 l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
385 ret = -1;
386 goto out;
387 }
388 /* Assume we're mapping an L1 table, falling back to twisted L2. */
389 ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
390 if ( ret ) ret = get_twisted_l2_table(page_nr, l2_entry);
391 if ( ret ) goto out;
392 }
394 /* Now we simply slap in our high mapping. */
395 memcpy(p_l2_entry,
396 idle_pg_table[smp_processor_id()] + DOMAIN_ENTRIES_PER_L2_PAGETABLE,
397 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
398 p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
399 DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
400 mk_l2_pgentry(__pa(current->mm.perdomain_pt) | __PAGE_HYPERVISOR);
402 out:
403 unmap_domain_mem(p_l2_entry);
404 return ret;
405 }
408 static int get_l1_table(unsigned long page_nr)
409 {
410 l1_pgentry_t *p_l1_entry, l1_entry;
411 int i, ret;
413 /* Update ref count for page pointed at by PDE. */
414 ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
415 if ( ret != 0 ) return (ret < 0) ? ret : 0;
417 /* NEW level-1 page table! Deal with every PTE in the table. */
418 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
419 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
420 {
421 l1_entry = *p_l1_entry++;
422 if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
423 if ( (l1_pgentry_val(l1_entry) &
424 (_PAGE_GLOBAL|_PAGE_PAT)) )
425 {
426 MEM_LOG("Bad L1 page type settings %04lx",
427 l1_pgentry_val(l1_entry) &
428 (_PAGE_GLOBAL|_PAGE_PAT));
429 ret = -1;
430 goto out;
431 }
432 ret = get_page(l1_pgentry_to_pagenr(l1_entry),
433 l1_pgentry_val(l1_entry) & _PAGE_RW);
434 if ( ret ) goto out;
435 }
437 out:
438 /* Make sure we unmap the right page! */
439 unmap_domain_mem(p_l1_entry-1);
440 return ret;
441 }
444 static int get_page(unsigned long page_nr, int writeable)
445 {
446 struct pfn_info *page;
447 unsigned long flags;
449 /* Update ref count for page pointed at by PTE. */
450 if ( page_nr >= max_page )
451 {
452 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
453 return(-1);
454 }
455 page = frame_table + page_nr;
456 flags = page->flags;
457 if ( !DOMAIN_OKAY(flags) )
458 {
459 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
460 return(-1);
461 }
463 if ( writeable )
464 {
465 if ( (flags & PG_type_mask) != PGT_writeable_page )
466 {
467 if ( page_type_count(page) != 0 )
468 {
469 MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
470 flags & PG_type_mask, PGT_writeable_page,
471 page_type_count(page));
472 return(-1);
473 }
474 page->flags &= ~PG_type_mask;
475 page->flags |= PGT_writeable_page;
476 }
477 page->flags |= PG_need_flush;
478 get_page_type(page);
479 }
481 get_page_tot(page);
483 return(0);
484 }
487 static void put_l2_table(unsigned long page_nr)
488 {
489 l2_pgentry_t *p_l2_entry, l2_entry;
490 int i;
492 if ( dec_page_refcnt(page_nr, PGT_l2_page_table) ) return;
494 /* We had last reference to level-2 page table. Free the PDEs. */
495 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
496 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
497 {
498 l2_entry = *p_l2_entry++;
499 if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
500 put_l1_table(l2_pgentry_to_pagenr(l2_entry));
501 }
503 unmap_domain_mem(p_l2_entry);
504 }
507 static void put_l1_table(unsigned long page_nr)
508 {
509 l1_pgentry_t *p_l1_entry, l1_entry;
510 int i;
512 if ( dec_page_refcnt(page_nr, PGT_l1_page_table) ) return;
514 /* We had last reference to level-1 page table. Free the PTEs. */
515 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
516 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
517 {
518 l1_entry = *p_l1_entry++;
519 if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) )
520 {
521 put_page(l1_pgentry_to_pagenr(l1_entry),
522 l1_pgentry_val(l1_entry) & _PAGE_RW);
523 }
524 }
526 /* Make sure we unmap the right page! */
527 unmap_domain_mem(p_l1_entry-1);
528 }
531 static void put_page(unsigned long page_nr, int writeable)
532 {
533 struct pfn_info *page;
534 ASSERT(page_nr < max_page);
535 page = frame_table + page_nr;
536 ASSERT(DOMAIN_OKAY(page->flags));
537 ASSERT((!writeable) ||
538 ((page_type_count(page) != 0) &&
539 ((page->flags & PG_type_mask) == PGT_writeable_page) &&
540 ((page->flags & PG_need_flush) == PG_need_flush)));
541 if ( writeable )
542 {
543 if ( put_page_type(page) == 0 )
544 {
545 flush_tlb[smp_processor_id()] = 1;
546 page->flags &= ~PG_need_flush;
547 }
548 }
549 else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
550 (page_type_count(page) != 0)) )
551 {
552 /* We expect this to be rare, so we just blow away the entire shadow LDT. */
553 invalidate_shadow_ldt();
554 }
555 put_page_tot(page);
556 }
559 static int mod_l2_entry(unsigned long pa, l2_pgentry_t new_l2_entry)
560 {
561 l2_pgentry_t *p_l2_entry, old_l2_entry;
563 p_l2_entry = map_domain_mem(pa);
564 old_l2_entry = *p_l2_entry;
566 if ( (((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
567 DOMAIN_ENTRIES_PER_L2_PAGETABLE )
568 {
569 MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
570 p_l2_entry);
571 goto fail;
572 }
574 if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
575 {
576 if ( (l2_pgentry_val(new_l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)) )
577 {
578 MEM_LOG("Bad L2 entry val %04lx",
579 l2_pgentry_val(new_l2_entry) &
580 (_PAGE_GLOBAL|_PAGE_PSE));
581 goto fail;
582 }
583 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
584 if ( ((l2_pgentry_val(old_l2_entry) ^
585 l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
586 {
587 if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
588 {
589 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
590 }
592 /* Assume we're mapping an L1 table, falling back to twisted L2. */
593 if ( get_l1_table(l2_pgentry_to_pagenr(new_l2_entry)) &&
594 get_twisted_l2_table(pa >> PAGE_SHIFT, new_l2_entry) )
595 goto fail;
596 }
597 }
598 else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
599 {
600 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
601 }
603 *p_l2_entry = new_l2_entry;
604 unmap_domain_mem(p_l2_entry);
605 return 0;
607 fail:
608 unmap_domain_mem(p_l2_entry);
609 return -1;
610 }
613 static int mod_l1_entry(unsigned long pa, l1_pgentry_t new_l1_entry)
614 {
615 l1_pgentry_t *p_l1_entry, old_l1_entry;
617 p_l1_entry = map_domain_mem(pa);
618 old_l1_entry = *p_l1_entry;
620 if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
621 {
622 if ( (l1_pgentry_val(new_l1_entry) &
623 (_PAGE_GLOBAL|_PAGE_PAT)) )
624 {
626 MEM_LOG("Bad L1 entry val %04lx",
627 l1_pgentry_val(new_l1_entry) &
628 (_PAGE_GLOBAL|_PAGE_PAT));
629 goto fail;
630 }
631 /*
632 * Differ in mapping (bits 12-31), writeable (bit 1), or
633 * presence (bit 0)?
634 */
635 if ( ((l1_pgentry_val(old_l1_entry) ^
636 l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
637 {
638 if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
639 {
640 put_page(l1_pgentry_to_pagenr(old_l1_entry),
641 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
642 }
644 if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
645 l1_pgentry_val(new_l1_entry) & _PAGE_RW) ){
646 goto fail;
647 }
648 }
649 }
650 else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
651 {
652 put_page(l1_pgentry_to_pagenr(old_l1_entry),
653 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
654 }
656 *p_l1_entry = new_l1_entry;
657 unmap_domain_mem(p_l1_entry);
658 return 0;
660 fail:
661 unmap_domain_mem(p_l1_entry);
662 return -1;
663 }
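/*
 * Example of the 0xfffff003 test above: rewriting a PTE that only toggles
 * its accessed/dirty bits (bits 5-6) leaves the masked bits unchanged, so
 * mod_l1_entry() skips the put_page()/get_page() step; toggling _PAGE_RW
 * (bit 1) or changing the frame address (bits 12-31) takes the full
 * re-counting path.
 */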
666 static int do_extended_command(unsigned long ptr, unsigned long val)
667 {
668 int err = 0;
669 unsigned int cmd = val & PGEXT_CMD_MASK;
670 unsigned long pfn = ptr >> PAGE_SHIFT;
671 struct pfn_info *page = frame_table + pfn;
673 /* 'ptr' must be within range, except for commands in which it is not a
674 machine address (PGEXT_SET_LDT passes a virtual address). */
674 if ( (pfn >= max_page) && (cmd != PGEXT_SET_LDT) )
675 return 1;
677 switch ( cmd )
678 {
679 case PGEXT_PIN_L1_TABLE:
680 err = get_l1_table(pfn);
681 goto mark_as_pinned;
682 case PGEXT_PIN_L2_TABLE:
683 err = get_l2_table(pfn);
684 mark_as_pinned:
685 if ( err )
686 {
687 MEM_LOG("Error while pinning pfn %08lx", pfn);
688 break;
689 }
690 put_page_type(page);
691 put_page_tot(page);
692 if ( !(page->type_count & REFCNT_PIN_BIT) )
693 {
694 page->type_count |= REFCNT_PIN_BIT;
695 page->tot_count |= REFCNT_PIN_BIT;
696 }
697 else
698 {
699 MEM_LOG("Pfn %08lx already pinned", pfn);
700 err = 1;
701 }
702 break;
704 case PGEXT_UNPIN_TABLE:
705 if ( !DOMAIN_OKAY(page->flags) )
706 {
707 err = 1;
708 MEM_LOG("Page %08lx bad domain (dom=%ld)",
709 ptr, page->flags & PG_domain_mask);
710 }
711 else if ( (page->type_count & REFCNT_PIN_BIT) )
712 {
713 page->type_count &= ~REFCNT_PIN_BIT;
714 page->tot_count &= ~REFCNT_PIN_BIT;
715 get_page_type(page);
716 get_page_tot(page);
717 ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
718 put_l1_table(pfn) : put_l2_table(pfn);
719 }
720 else
721 {
722 err = 1;
723 MEM_LOG("Pfn %08lx not pinned", pfn);
724 }
725 break;
727 case PGEXT_NEW_BASEPTR:
728 err = get_l2_table(pfn);
729 if ( !err )
730 {
731 put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
732 current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
733 invalidate_shadow_ldt();
734 flush_tlb[smp_processor_id()] = 1;
735 }
736 else
737 {
738 MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
739 }
740 break;
742 case PGEXT_TLB_FLUSH:
743 flush_tlb[smp_processor_id()] = 1;
744 break;
746 case PGEXT_INVLPG:
747 __flush_tlb_one(val & ~PGEXT_CMD_MASK);
748 break;
750 case PGEXT_SET_LDT:
751 {
752 unsigned long ents = val >> PGEXT_CMD_SHIFT;
753 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
754 (ents > 8192) ||
755 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
756 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
757 {
758 err = 1;
759 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
760 }
761 else if ( (current->mm.ldt_ents != ents) ||
762 (current->mm.ldt_base != ptr) )
763 {
764 if ( current->mm.ldt_ents != 0 )
765 {
766 invalidate_shadow_ldt();
767 flush_tlb[smp_processor_id()] = 1;
768 }
769 current->mm.ldt_base = ptr;
770 current->mm.ldt_ents = ents;
771 load_LDT();
772 }
773 break;
774 }
776 default:
777 MEM_LOG("Invalid extended pt command 0x%08lx", val & PGEXT_CMD_MASK);
778 err = 1;
779 break;
780 }
782 return err;
783 }
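/*
 * Illustrative sketch (not part of the original file): how a guest might
 * encode an extended command such as PGEXT_PIN_L2_TABLE. The low bits of
 * 'ptr' select PGREQ_EXTENDED_COMMAND (they demux the command in
 * do_process_page_updates()), the low bits of 'val' (PGEXT_CMD_MASK) carry
 * the extended command itself, and the rest of 'ptr' is the machine
 * address of the target frame.
 */
#if 0 /* example only -- not compiled */
static void example_pin_l2(unsigned long l2_pfn, page_update_request_t *req)
{
    req->ptr = (l2_pfn << PAGE_SHIFT) | PGREQ_EXTENDED_COMMAND;
    req->val = PGEXT_PIN_L2_TABLE;
    /* Submitting this request reaches do_extended_command() above, which
     * calls get_l2_table(l2_pfn) and sets REFCNT_PIN_BIT on the frame. */
}
#endif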
786 int do_process_page_updates(page_update_request_t *ureqs, int count)
787 {
788 page_update_request_t req;
789 unsigned long flags, pfn;
790 struct pfn_info *page;
791 int err = 0, i;
792 unsigned int cmd;
794 for ( i = 0; i < count; i++ )
795 {
796 if ( copy_from_user(&req, ureqs, sizeof(req)) )
797 {
798 kill_domain_with_errmsg("Cannot read page update request");
799 }
801 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
803 /* All normal commands must have 'ptr' in range. */
804 pfn = req.ptr >> PAGE_SHIFT;
805 if ( (pfn >= max_page) && (cmd != PGREQ_EXTENDED_COMMAND) )
806 {
807 MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
808 kill_domain_with_errmsg("Page update request out of range");
809 }
811 err = 1;
813 /* Least significant bits of 'ptr' demux the operation type. */
814 spin_lock_irq(&current->page_lock);
815 switch ( cmd )
816 {
817 /*
818 * PGREQ_NORMAL: Normal update to any level of page table.
819 */
820 case PGREQ_NORMAL:
821 page = frame_table + pfn;
822 flags = page->flags;
824 if ( DOMAIN_OKAY(flags) )
825 {
826 switch ( (flags & PG_type_mask) )
827 {
828 case PGT_l1_page_table:
829 err = mod_l1_entry(req.ptr, mk_l1_pgentry(req.val));
830 break;
831 case PGT_l2_page_table:
832 err = mod_l2_entry(req.ptr, mk_l2_pgentry(req.val));
833 break;
834 default:
835 MEM_LOG("Update to non-pt page %08lx", req.ptr);
836 break;
837 }
838 }
839 else
840 {
841 MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
842 current->domain, pfn);
843 }
844 break;
846 case PGREQ_MPT_UPDATE:
847 page = frame_table + pfn;
848 if ( DOMAIN_OKAY(page->flags) )
849 {
850 machine_to_phys_mapping[pfn] = req.val;
851 err = 0;
852 }
853 else
854 {
855 MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
856 current->domain, pfn);
857 }
858 break;
860 /*
861 * PGREQ_EXTENDED_COMMAND: The extended command is specified
862 * in the least-significant bits of the 'val' field.
863 */
864 case PGREQ_EXTENDED_COMMAND:
865 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
866 err = do_extended_command(req.ptr, req.val);
867 break;
869 case PGREQ_UNCHECKED_UPDATE:
870 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
871 if ( current->domain == 0 )
872 {
873 unsigned long *ptr = map_domain_mem(req.ptr);
874 *ptr = req.val;
875 unmap_domain_mem(ptr);
876 err = 0;
877 }
878 else
879 {
880 MEM_LOG("Bad unchecked update attempt");
881 }
882 break;
884 default:
885 MEM_LOG("Invalid page update command %08lx", req.ptr);
886 break;
887 }
888 spin_unlock_irq(&current->page_lock);
890 if ( err )
891 {
892 kill_domain_with_errmsg("Illegal page update request");
893 }
895 ureqs++;
896 }
898 if ( flush_tlb[smp_processor_id()] )
899 {
900 flush_tlb[smp_processor_id()] = 0;
901 __write_cr3_counted(pagetable_val(current->mm.pagetable));
903 }
905 return(0);
906 }