view xen/arch/x86/x86_64/mm.c @ 3726:88957a238191

bitkeeper revision 1.1159.1.544 (4207248crq3YxiyLWjUehtHv_Yd3tg)

Merge tempest.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xeno.bk
into tempest.cl.cam.ac.uk:/local/scratch/smh22/xen-unstable.bk
author smh22@tempest.cl.cam.ac.uk
date Mon Feb 07 08:19:24 2005 +0000 (2005-02-07)
parents 393483ae9f62 253e8e10e986
children f5f2757b3aa2

/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/domain_page.h>

void *safe_page_alloc(void)
{
    extern int early_boot;
    if ( early_boot )
        return __va(alloc_boot_pages(PAGE_SIZE, PAGE_SIZE));
    return (void *)alloc_xenheap_page();
}
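
/*
 * Editor's illustrative sketch (not part of the original file): callers that
 * need a zeroed page-table page pair safe_page_alloc() with clear_page(),
 * exactly as map_pages() does below. The helper name is hypothetical and the
 * block is kept out of the build with '#if 0'.
 */
#if 0
static void *example_alloc_cleared_pt_page(void)
{
    void *pg = safe_page_alloc(); /* boot allocator early, xenheap afterwards */
    clear_page(pg);               /* new page-table pages must start out empty */
    return pg;
}
#endif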

/* Map physical byte range (@p, @p+@s) at virt address @v in pagetable @pt. */
int map_pages(
    pagetable_t *pt,
    unsigned long v,
    unsigned long p,
    unsigned long s,
    unsigned long flags)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;
    l1_pgentry_t *pl1e;
    void         *newpg;

    while ( s != 0 )
    {
        pl4e = &pt[l4_table_offset(v)];
        if ( !(l4_pgentry_val(*pl4e) & _PAGE_PRESENT) )
        {
            newpg = safe_page_alloc();
            clear_page(newpg);
            *pl4e = mk_l4_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
        }

        pl3e = l4_pgentry_to_l3(*pl4e) + l3_table_offset(v);
        if ( !(l3_pgentry_val(*pl3e) & _PAGE_PRESENT) )
        {
            newpg = safe_page_alloc();
            clear_page(newpg);
            *pl3e = mk_l3_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
        }

        pl2e = l3_pgentry_to_l2(*pl3e) + l2_table_offset(v);

        if ( ((s|v|p) & ((1<<L2_PAGETABLE_SHIFT)-1)) == 0 )
        {
            /* Super-page mapping. */
            if ( (l2_pgentry_val(*pl2e) & _PAGE_PRESENT) )
                __flush_tlb_pge();
            *pl2e = mk_l2_pgentry(p|flags|_PAGE_PSE);

            v += 1 << L2_PAGETABLE_SHIFT;
            p += 1 << L2_PAGETABLE_SHIFT;
            s -= 1 << L2_PAGETABLE_SHIFT;
        }
        else
        {
            /* Normal page mapping. */
            if ( !(l2_pgentry_val(*pl2e) & _PAGE_PRESENT) )
            {
                newpg = safe_page_alloc();
                clear_page(newpg);
                *pl2e = mk_l2_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
            }
            pl1e = l2_pgentry_to_l1(*pl2e) + l1_table_offset(v);
            if ( (l1_pgentry_val(*pl1e) & _PAGE_PRESENT) )
                __flush_tlb_one(v);
            *pl1e = mk_l1_pgentry(p|flags);

            v += 1 << L1_PAGETABLE_SHIFT;
            p += 1 << L1_PAGETABLE_SHIFT;
            s -= 1 << L1_PAGETABLE_SHIFT;
        }
    }

    return 0;
}
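
/*
 * Editor's illustrative sketch (not part of the original file): one possible
 * call into map_pages(). Because the virtual address, physical address and
 * size are all 2MB-aligned, the loop above takes the super-page (_PAGE_PSE)
 * path and writes two L2 entries instead of 1024 L1 entries. The addresses
 * and the helper name are hypothetical; '#if 0' keeps it out of the build.
 */
#if 0
static void example_map_4mb_region(void)
{
    map_pages(idle_pg_table,
              0xffff840000000000UL, /* hypothetical virtual address  */
              0x40000000UL,         /* hypothetical physical address */
              4UL << 20,            /* 4MB, 2MB-aligned              */
              PAGE_HYPERVISOR);
}
#endif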

void __set_fixmap(
    enum fixed_addresses idx, unsigned long p, unsigned long flags)
{
    if ( unlikely(idx >= __end_of_fixed_addresses) )
        BUG();
    map_pages(idle_pg_table, fix_to_virt(idx), p, PAGE_SIZE, flags);
}

void __init paging_init(void)
{
    void *newpt;
    unsigned long i, p, max;

    /* Map all of physical memory. */
    max = ((max_page + ENTRIES_PER_L1_PAGETABLE - 1) &
           ~(ENTRIES_PER_L1_PAGETABLE - 1)) << PAGE_SHIFT;
    map_pages(idle_pg_table, PAGE_OFFSET, 0, max, PAGE_HYPERVISOR);

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for ioremap().
     */
    for ( i = 0; i < max_page; i += ((1UL << L2_PAGETABLE_SHIFT) / 8) )
    {
        p = alloc_boot_pages(1UL << L2_PAGETABLE_SHIFT,
                             1UL << L2_PAGETABLE_SHIFT);
        if ( p == 0 )
            panic("Not enough memory for m2p table\n");
        map_pages(idle_pg_table, RDWR_MPT_VIRT_START + i*8, p,
                  1UL << L2_PAGETABLE_SHIFT, PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + i*8), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
    }

    /* Create read-only mapping of MPT for guest-OS use. */
    newpt = (void *)alloc_xenheap_page();
    clear_page(newpt);
    idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)] =
        mk_l4_pgentry((__pa(newpt) | __PAGE_HYPERVISOR | _PAGE_USER) &
                      ~_PAGE_RW);
    /* Copy the L3 mappings from the RDWR_MPT area. */
    p  = l4_pgentry_val(idle_pg_table[l4_table_offset(RDWR_MPT_VIRT_START)]);
    p &= PAGE_MASK;
    p += l3_table_offset(RDWR_MPT_VIRT_START) * sizeof(l3_pgentry_t);
    newpt = (void *)((unsigned long)newpt +
                     (l3_table_offset(RO_MPT_VIRT_START) *
                      sizeof(l3_pgentry_t)));
    memcpy(newpt, __va(p),
           (RDWR_MPT_VIRT_END - RDWR_MPT_VIRT_START) >> L3_PAGETABLE_SHIFT);

    /* Set up linear page table mapping. */
    idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)] =
        mk_l4_pgentry(__pa(idle_pg_table) | __PAGE_HYPERVISOR);
}
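
/*
 * Editor's note with a worked example (not part of the original file): each
 * machine-to-phys entry is an 8-byte unsigned long, so one 2MB chunk of
 * table ((1 << L2_PAGETABLE_SHIFT) bytes) covers 2MB/8 = 262144 frames.
 * That is why the loop above steps 'i' by (1UL << L2_PAGETABLE_SHIFT)/8 and
 * maps each chunk at byte offset i*8 from RDWR_MPT_VIRT_START. Assuming the
 * usual machine_to_phys_mapping accessor defined elsewhere in the tree, a
 * lookup after paging_init() is just an array index; '#if 0' keeps this
 * sketch out of the build.
 */
#if 0
static unsigned long example_m2p_lookup(unsigned long mfn)
{
    /* Entry lives at RDWR_MPT_VIRT_START + mfn*8. */
    return machine_to_phys_mapping[mfn];
}
#endif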

void __init zap_low_mappings(void)
{
    idle_pg_table[0] = mk_l4_pgentry(0);
    flush_tlb_all_pge();
}

void subarch_init_memory(struct domain *dom_xen)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct pfn_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    if ( (offsetof(struct pfn_info, u.inuse._domain) !=
          (offsetof(struct pfn_info, count_info) + sizeof(u32))) )
    {
        printk("Weird pfn_info layout (%ld,%ld,%d)\n",
               offsetof(struct pfn_info, count_info),
               offsetof(struct pfn_info, u.inuse._domain),
               sizeof(struct pfn_info));
        for ( ; ; ) ;
    }

    /* M2P table is mappable read-only by privileged domains. */
    for ( v  = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4_pgentry_to_l3(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3_pgentry_val(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3_pgentry_to_l2(l3e)[l2_table_offset(v)];
        if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2_pgentry_to_pagenr(l2e);

        for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
        {
            frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
            /* Use the 'gdt' page type to ensure the frame can only be
               mapped read-only by non-privileged domains. */
            frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
            page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen);
        }
    }
}
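
/*
 * Editor's illustrative alternative (not part of the original file): the
 * run-time layout check above could also be written as a compile-time
 * assertion using the classic negative-array-size trick, with no dependency
 * beyond offsetof(). Shown under '#if 0' as a sketch only.
 */
#if 0
typedef char pfn_info_layout_assert[
    (offsetof(struct pfn_info, u.inuse._domain) ==
     (offsetof(struct pfn_info, count_info) + sizeof(u32))) ? 1 : -1];
#endif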

/*
 * Allows shooting down of borrowed page-table use on specific CPUs.
 * Specifically, we borrow page tables when running the idle domain.
 */
static void __synchronise_pagetables(void *mask)
{
    struct exec_domain *ed = current;
    if ( ((unsigned long)mask & (1 << ed->processor)) &&
         is_idle_task(ed->domain) )
        write_ptbase(ed);
}
void synchronise_pagetables(unsigned long cpu_mask)
{
    __synchronise_pagetables((void *)cpu_mask);
    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
}

long do_stack_switch(unsigned long ss, unsigned long esp)
{
    if ( (ss & 3) != 3 )
        return -EPERM;
    current->arch.guestos_ss = ss;
    current->arch.guestos_sp = esp;
    return 0;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(unsigned long *d)
{
    unsigned long base, limit, a = d[0], b = d[1];

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /*
     * We don't allow a DPL of zero. There is no legitimate reason for
     * specifying DPL==0, and it gets rather dangerous if we also accept call
     * gates (consider a call gate pointing at another guestos descriptor with
     * DPL 0 -- this would get the OS ring-0 privileges).
     */
    if ( (b & _SEGMENT_DPL) == 0 )
        goto bad;

    if ( !(b & _SEGMENT_S) )
    {
        /*
         * System segment:
         *  1. Don't allow interrupt or trap gates as they belong in the IDT.
         *  2. Don't allow TSS descriptors or task gates as we don't
         *     virtualise x86 tasks.
         *  3. Don't allow LDT descriptors because they're unnecessary and
         *     I'm uneasy about allowing an LDT page to contain LDT
         *     descriptors. In any case, Xen automatically creates the
         *     required descriptor when reloading the LDT register.
         *  4. We allow call gates but they must not jump to a private segment.
         */

        /* Disallow everything but call gates. */
        if ( (b & _SEGMENT_TYPE) != 0xc00 )
            goto bad;

#if 0
        /* Can't allow far jump to a Xen-private segment. */
        if ( !VALID_CODESEL(a>>16) )
            goto bad;
#endif

        /* Reserved bits must be zero. */
        if ( (b & 0xe0) != 0 )
            goto bad;

        /* No base/limit check is needed for a call gate. */
        goto good;
    }

    /* Check that base is at least a page away from Xen-private area. */
    base  = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16);
    if ( base >= (PAGE_OFFSET - PAGE_SIZE) )
        goto bad;

    /* Check and truncate the limit if necessary. */
    limit = (b&0xf0000) | (a&0xffff);
    limit++; /* We add one because limit is inclusive. */
    if ( (b & _SEGMENT_G) )
        limit <<= 12;

    if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC )
    {
        /*
         * Grows-down limit check.
         * NB. limit == 0xFFFFF provides no access      (if G=1).
         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
         */
        if ( (base + limit) > base )
        {
            limit = -(base & PAGE_MASK);
            goto truncate;
        }
    }
    else
    {
        /*
         * Grows-up limit check.
         * NB. limit == 0xFFFFF provides 4GB access (if G=1).
         *     limit == 0x00000 provides 4kB access (if G=1).
         */
        if ( ((base + limit) <= base) ||
             ((base + limit) > PAGE_OFFSET) )
        {
            limit = PAGE_OFFSET - base;
        truncate:
            if ( !(b & _SEGMENT_G) )
                goto bad; /* too dangerous; too hard to work out... */
            limit = (limit >> 12) - 1;
            d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff;
            d[1] &= ~0xf0000; d[1] |= limit & 0xf0000;
        }
    }

 good:
    return 1;
 bad:
    return 0;
}
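
/*
 * Editor's worked example (not part of the original file): a flat 4GB,
 * ring-1, readable code segment encodes as d[0] = 0x0000ffff (limit 15:0,
 * base 15:0) and d[1] = 0x00cfba00 (base 23:16, type/S/DPL/P, limit 19:16,
 * flags, base 31:24). check_descriptor() accepts it: the present bit is set,
 * DPL is 1, base is 0, and the page-granular limit (4GB) stays below
 * PAGE_OFFSET. The same descriptor with DPL 0 (d[1] = 0x00cf9a00) fails the
 * DPL check above. The helper name is hypothetical; the block is never built.
 */
#if 0
static int example_check_flat_ring1_code_seg(void)
{
    unsigned long d[2] = { 0x0000ffffUL, 0x00cfba00UL };
    return check_descriptor(d); /* returns 1 (valid) */
}
#endif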

void destroy_gdt(struct exec_domain *ed)
{
    int i;
    unsigned long pfn;

    for ( i = 0; i < 16; i++ )
    {
        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
            put_page_and_type(&frame_table[pfn]);
        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
    }
}

long set_gdt(struct exec_domain *ed,
             unsigned long *frames,
             unsigned int entries)
{
    struct domain *d = ed->domain;
    /* NB. There are 512 8-byte entries per GDT page. */
    int i = 0, nr_pages = (entries + 511) / 512;
    struct desc_struct *vgdt;
    unsigned long pfn;

    /* Check the first page in the new GDT. */
    if ( (pfn = frames[0]) >= max_page )
        goto fail;

    /* The first page is special because Xen owns a range of entries in it. */
    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
    {
        /* GDT checks failed: try zapping the Xen reserved entries. */
        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
            goto fail;
        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
               NR_RESERVED_GDT_ENTRIES*8);
        unmap_domain_mem(vgdt);
        put_page_and_type(&frame_table[pfn]);

        /* Okay, we zapped the entries. Now try the GDT checks again. */
        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
            goto fail;
    }

    /* Check the remaining pages in the new GDT. */
    for ( i = 1; i < nr_pages; i++ )
        if ( ((pfn = frames[i]) >= max_page) ||
             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
            goto fail;

    /* Copy reserved GDT entries to the new GDT. */
    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
           gdt_table + FIRST_RESERVED_GDT_ENTRY,
           NR_RESERVED_GDT_ENTRIES*8);
    unmap_domain_mem(vgdt);

    /* Tear down the old GDT. */
    destroy_gdt(ed);

    /* Install the new GDT. */
    for ( i = 0; i < nr_pages; i++ )
        ed->arch.perdomain_ptes[i] =
            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);

    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
    SET_GDT_ENTRIES(ed, entries);

    return 0;

 fail:
    while ( i-- > 0 )
        put_page_and_type(&frame_table[frames[i]]);
    return -EINVAL;
}
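
/*
 * Editor's illustrative sketch (not part of the original file): with 512
 * eight-byte descriptors per page, a 2048-entry guest GDT needs
 * (2048 + 511) / 512 = 4 frames. The frame numbers and helper name below are
 * made up; the block is kept out of the build with '#if 0'.
 */
#if 0
static long example_install_guest_gdt(struct exec_domain *ed)
{
    unsigned long frames[4] = { 0x1000, 0x1001, 0x1002, 0x1003 };
    return set_gdt(ed, frames, 2048);
}
#endif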

long do_set_gdt(unsigned long *frame_list, unsigned int entries)
{
    int nr_pages = (entries + 511) / 512;
    unsigned long frames[16];
    long ret;

    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
        return -EINVAL;

    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
        return -EFAULT;

    if ( (ret = set_gdt(current, frames, entries)) == 0 )
    {
        local_flush_tlb();
        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
    }

    return ret;
}

long do_update_descriptor(
    unsigned long pa, unsigned long word1, unsigned long word2)
{
    unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2];
    struct pfn_info *page;
    long ret = -EINVAL;

    d[0] = word1;
    d[1] = word2;

    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) )
        return -EINVAL;

    page = &frame_table[pfn];
    if ( unlikely(!get_page(page, current->domain)) )
        return -EINVAL;

    /* Check if the given frame is in use in an unsafe context. */
    switch ( page->u.inuse.type_info & PGT_type_mask )
    {
    case PGT_gdt_page:
        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
        if ( (l1_pgentry_to_pagenr(current->arch.perdomain_ptes[0]) == pfn) &&
             (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
             (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
            goto out;
        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
            goto out;
        break;
    case PGT_ldt_page:
        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
            goto out;
        break;
    default:
        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
            goto out;
        break;
    }

    /* All is good so make the update. */
    gdt_pent = map_domain_mem(pa);
    memcpy(gdt_pent, d, 8);
    unmap_domain_mem(gdt_pent);

    put_page_type(page);

    ret = 0; /* success */

 out:
    put_page(page);
    return ret;
}

#ifdef MEMORY_GUARD

#define ALLOC_PT(_level)                                                \
do {                                                                    \
    (_level) = (_level ## _pgentry_t *)heap_start;                      \
    heap_start = (void *)((unsigned long)heap_start + PAGE_SIZE);       \
    clear_page(_level);                                                 \
} while ( 0 )

void *memguard_init(void *heap_start)
{
    l1_pgentry_t *l1 = NULL;
    l2_pgentry_t *l2 = NULL;
    l3_pgentry_t *l3 = NULL;
    l4_pgentry_t *l4 = &idle_pg_table[l4_table_offset(PAGE_OFFSET)];
    unsigned long i, j;

    /* Round the allocation pointer up to a page boundary. */
    heap_start = (void *)(((unsigned long)heap_start + (PAGE_SIZE-1)) &
                          PAGE_MASK);

    /* Memory guarding is incompatible with super pages. */
    for ( i = 0; i < (xenheap_phys_end >> L2_PAGETABLE_SHIFT); i++ )
    {
        ALLOC_PT(l1);
        for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
            l1[j] = mk_l1_pgentry((i << L2_PAGETABLE_SHIFT) |
                                  (j << L1_PAGETABLE_SHIFT) |
                                  __PAGE_HYPERVISOR);
        if ( !((unsigned long)l2 & (PAGE_SIZE-1)) )
        {
            ALLOC_PT(l2);
            if ( !((unsigned long)l3 & (PAGE_SIZE-1)) )
            {
                ALLOC_PT(l3);
                *l4++ = mk_l4_pgentry(virt_to_phys(l3) | __PAGE_HYPERVISOR);
            }
            *l3++ = mk_l3_pgentry(virt_to_phys(l2) | __PAGE_HYPERVISOR);
        }
        *l2++ = mk_l2_pgentry(virt_to_phys(l1) | __PAGE_HYPERVISOR);
    }

    return heap_start;
}

static void __memguard_change_range(void *p, unsigned long l, int guard)
{
    l1_pgentry_t *l1;
    l2_pgentry_t *l2;
    l3_pgentry_t *l3;
    l4_pgentry_t *l4;
    unsigned long _p = (unsigned long)p;
    unsigned long _l = (unsigned long)l;

    /* Ensure we are dealing with a page-aligned whole number of pages. */
    ASSERT((_p&PAGE_MASK) != 0);
    ASSERT((_l&PAGE_MASK) != 0);
    ASSERT((_p&~PAGE_MASK) == 0);
    ASSERT((_l&~PAGE_MASK) == 0);

    while ( _l != 0 )
    {
        l4 = &idle_pg_table[l4_table_offset(_p)];
        l3 = l4_pgentry_to_l3(*l4) + l3_table_offset(_p);
        l2 = l3_pgentry_to_l2(*l3) + l2_table_offset(_p);
        l1 = l2_pgentry_to_l1(*l2) + l1_table_offset(_p);
        if ( guard )
            *l1 = mk_l1_pgentry(l1_pgentry_val(*l1) & ~_PAGE_PRESENT);
        else
            *l1 = mk_l1_pgentry(l1_pgentry_val(*l1) | _PAGE_PRESENT);
        _p += PAGE_SIZE;
        _l -= PAGE_SIZE;
    }
}

void memguard_guard_stack(void *p)
{
    p = (void *)((unsigned long)p + PAGE_SIZE);
    memguard_guard_range(p, 2 * PAGE_SIZE);
}
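
/*
 * Editor's illustrative sketch (not part of the original file):
 * memguard_guard_stack() unmaps the two pages starting one page above the
 * address it is given, so a stack that overruns its usable area faults
 * instead of silently corrupting a neighbouring allocation. Assuming the
 * order-based xenheap allocator, a hypothetical order-2 (four page) stack
 * allocation would be guarded like this; '#if 0' keeps it out of the build.
 */
#if 0
static void *example_alloc_guarded_stack(void)
{
    void *stack = (void *)alloc_xenheap_pages(2); /* 4 contiguous pages  */
    memguard_guard_stack(stack);                  /* pages 1-2 -> guards */
    return stack;
}
#endif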

void memguard_guard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 1);
    local_flush_tlb();
}

void memguard_unguard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 0);
}

#endif