debuggers.hg

view xen/arch/x86/x86_64/mm.c @ 3686:5c112b235281

bitkeeper revision 1.1159.212.85 (42038b45EjUo-1JiSCHXW0Wav4TZGQ)

x86_64 progress: now entering ring 3. Need a hypercall (SYSCALL)
entry point, and some kind of DOM0 image to test against.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Fri Feb 04 14:48:37 2005 +0000 (2005-02-04)
parents 677cb76cff18
children 393483ae9f62 d93748c50893
/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/domain_page.h>
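
/*
 * safe_page_alloc() returns a page usable for page-table construction at
 * any stage of boot: while 'early_boot' is set the xenheap is not yet
 * initialised, so the page comes from the boot allocator; thereafter an
 * ordinary xenheap page is used.
 */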
void *safe_page_alloc(void)
{
    extern int early_boot;
    if ( early_boot )
        return __va(alloc_boot_pages(PAGE_SIZE, PAGE_SIZE));
    return (void *)alloc_xenheap_page();
}

/* Map physical byte range (@p, @p+@s) at virt address @v in pagetable @pt. */
int map_pages(
    pagetable_t *pt,
    unsigned long v,
    unsigned long p,
    unsigned long s,
    unsigned long flags)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;
    l1_pgentry_t *pl1e;
    void *newpg;

    while ( s != 0 )
    {
        pl4e = &pt[l4_table_offset(v)];
        if ( !(l4_pgentry_val(*pl4e) & _PAGE_PRESENT) )
        {
            newpg = safe_page_alloc();
            clear_page(newpg);
            *pl4e = mk_l4_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
        }

        pl3e = l4_pgentry_to_l3(*pl4e) + l3_table_offset(v);
        if ( !(l3_pgentry_val(*pl3e) & _PAGE_PRESENT) )
        {
            newpg = safe_page_alloc();
            clear_page(newpg);
            *pl3e = mk_l3_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
        }

        pl2e = l3_pgentry_to_l2(*pl3e) + l2_table_offset(v);
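
        /*
         * If the virtual address, physical address and remaining size are
         * all aligned to 1 << L2_PAGETABLE_SHIFT (2MB), the range can be
         * covered by a single super-page (PSE) L2 entry; otherwise fall
         * back to 4kB L1 mappings.
         */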
        if ( ((s|v|p) & ((1<<L2_PAGETABLE_SHIFT)-1)) == 0 )
        {
            /* Super-page mapping. */
            if ( (l2_pgentry_val(*pl2e) & _PAGE_PRESENT) )
                __flush_tlb_pge();
            *pl2e = mk_l2_pgentry(p|flags|_PAGE_PSE);

            v += 1 << L2_PAGETABLE_SHIFT;
            p += 1 << L2_PAGETABLE_SHIFT;
            s -= 1 << L2_PAGETABLE_SHIFT;
        }
        else
        {
            /* Normal page mapping. */
            if ( !(l2_pgentry_val(*pl2e) & _PAGE_PRESENT) )
            {
                newpg = safe_page_alloc();
                clear_page(newpg);
                *pl2e = mk_l2_pgentry(__pa(newpg) | __PAGE_HYPERVISOR);
            }
            pl1e = l2_pgentry_to_l1(*pl2e) + l1_table_offset(v);
            if ( (l1_pgentry_val(*pl1e) & _PAGE_PRESENT) )
                __flush_tlb_one(v);
            *pl1e = mk_l1_pgentry(p|flags);

            v += 1 << L1_PAGETABLE_SHIFT;
            p += 1 << L1_PAGETABLE_SHIFT;
            s -= 1 << L1_PAGETABLE_SHIFT;
        }
    }

    return 0;
}
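
#if 0
/*
 * Illustrative sketch, not part of this revision: map_pages() as it is used
 * by __set_fixmap() and paging_init() below.  The virtual and physical
 * addresses here are made-up example values; the flags are the standard
 * hypervisor attributes used elsewhere in this file.
 */
static void example_map_device(void)
{
    unsigned long virt = 0xffff828000000000UL; /* hypothetical VA        */
    unsigned long phys = 0x00000000fe000000UL; /* hypothetical MMIO base */

    /* Page-granular mapping: a single 4kB frame. */
    map_pages(idle_pg_table, virt, phys, PAGE_SIZE, PAGE_HYPERVISOR);

    /* A 2MB-aligned VA/PA/size triple takes the super-page path instead. */
    map_pages(idle_pg_table, virt + (1UL << L2_PAGETABLE_SHIFT), phys,
              1UL << L2_PAGETABLE_SHIFT, PAGE_HYPERVISOR);
}
#endif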

void __set_fixmap(
    enum fixed_addresses idx, unsigned long p, unsigned long flags)
{
    if ( unlikely(idx >= __end_of_fixed_addresses) )
        BUG();
    map_pages(idle_pg_table, fix_to_virt(idx), p, PAGE_SIZE, flags);
}

void __init paging_init(void)
{
    void *newpt;
    unsigned long i, p, max;

    /* Map all of physical memory. */
    max = ((max_page + ENTRIES_PER_L1_PAGETABLE - 1) &
           ~(ENTRIES_PER_L1_PAGETABLE - 1)) << PAGE_SHIFT;
    map_pages(idle_pg_table, PAGE_OFFSET, 0, max, PAGE_HYPERVISOR);

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for ioremap().
     */
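    /*
     * Each iteration below maps one 2MB extent of the table.  M2P entries
     * are 8 bytes wide, so a 2MB extent holds (1 << L2_PAGETABLE_SHIFT) / 8
     * = 262144 entries; hence 'i' (a machine frame number) advances by that
     * amount and the byte offset into the table is 'i*8'.
     */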
    for ( i = 0; i < max_page; i += ((1UL << L2_PAGETABLE_SHIFT) / 8) )
    {
        p = alloc_boot_pages(1UL << L2_PAGETABLE_SHIFT,
                             1UL << L2_PAGETABLE_SHIFT);
        if ( p == 0 )
            panic("Not enough memory for m2p table\n");
        map_pages(idle_pg_table, RDWR_MPT_VIRT_START + i*8, p,
                  1UL << L2_PAGETABLE_SHIFT, PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + i*8), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
    }

    /* Create read-only mapping of MPT for guest-OS use. */
    newpt = (void *)alloc_xenheap_page();
    clear_page(newpt);
    idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)] =
        mk_l4_pgentry((__pa(newpt) | __PAGE_HYPERVISOR | _PAGE_USER) &
                      ~_PAGE_RW);
    /* Copy the L3 mappings from the RDWR_MPT area. */
    p = l4_pgentry_val(idle_pg_table[l4_table_offset(RDWR_MPT_VIRT_START)]);
    p &= PAGE_MASK;
    p += l3_table_offset(RDWR_MPT_VIRT_START) * sizeof(l3_pgentry_t);
    newpt = (void *)((unsigned long)newpt +
                     (l3_table_offset(RO_MPT_VIRT_START) *
                      sizeof(l3_pgentry_t)));
    memcpy(newpt, __va(p),
           (RDWR_MPT_VIRT_END - RDWR_MPT_VIRT_START) >> L3_PAGETABLE_SHIFT);

    /* Set up linear page table mapping. */
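    /*
     * Installing idle_pg_table as one of its own L4 entries makes the
     * page-table pages appear as ordinary virtual memory from
     * LINEAR_PT_VIRT_START upwards, so individual PTEs can be read and
     * written through the linear mapping.
     */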
    idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)] =
        mk_l4_pgentry(__pa(idle_pg_table) | __PAGE_HYPERVISOR);
}

void __init zap_low_mappings(void)
{
    idle_pg_table[0] = mk_l4_pgentry(0);
    flush_tlb_all_pge();
}

void subarch_init_memory(struct domain *dom_xen)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct pfn_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    if ( (offsetof(struct pfn_info, u.inuse._domain) !=
          (offsetof(struct pfn_info, count_info) + sizeof(u32))) )
    {
        printk("Weird pfn_info layout (%ld,%ld,%d)\n",
               offsetof(struct pfn_info, count_info),
               offsetof(struct pfn_info, u.inuse._domain),
               sizeof(struct pfn_info));
        for ( ; ; ) ;
    }
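
    /*
     * The adjacency lets the reference count and the owning domain be read
     * and compare-and-swapped as a single aligned 64-bit quantity, so an
     * ownership check and a refcount update can be performed atomically.
     */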

    /* M2P table is mappable read-only by privileged domains. */
    for ( v = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4_pgentry_to_l3(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3_pgentry_val(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3_pgentry_to_l2(l3e)[l2_table_offset(v)];
        if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2_pgentry_to_pagenr(l2e);

        for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
        {
            frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
            /* gdt to make sure it's only mapped read-only by non-privileged
               domains. */
            frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
            page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen);
        }
    }
}

/*
 * Allows shooting down of borrowed page-table use on specific CPUs.
 * Specifically, we borrow page tables when running the idle domain.
 */
static void __synchronise_pagetables(void *mask)
{
    struct exec_domain *ed = current;
    if ( ((unsigned long)mask & (1 << ed->processor)) &&
         is_idle_task(ed->domain) )
        write_ptbase(&ed->mm);
}
void synchronise_pagetables(unsigned long cpu_mask)
{
    __synchronise_pagetables((void *)cpu_mask);
    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
}

long do_stack_switch(unsigned long ss, unsigned long esp)
{
    if ( (ss & 3) != 3 )
        return -EPERM;
    current->thread.guestos_ss = ss;
    current->thread.guestos_sp = esp;
    return 0;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(unsigned long *d)
{
    unsigned long base, limit, a = d[0], b = d[1];

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /*
     * We don't allow a DPL of zero. There is no legitimate reason for
     * specifying DPL==0, and it gets rather dangerous if we also accept call
     * gates (consider a call gate pointing at another guestos descriptor with
     * DPL 0 -- this would get the OS ring-0 privileges).
     */
    if ( (b & _SEGMENT_DPL) == 0 )
        goto bad;

    if ( !(b & _SEGMENT_S) )
    {
        /*
         * System segment:
         *  1. Don't allow interrupt or trap gates as they belong in the IDT.
         *  2. Don't allow TSS descriptors or task gates as we don't
         *     virtualise x86 tasks.
         *  3. Don't allow LDT descriptors because they're unnecessary and
         *     I'm uneasy about allowing an LDT page to contain LDT
         *     descriptors. In any case, Xen automatically creates the
         *     required descriptor when reloading the LDT register.
         *  4. We allow call gates but they must not jump to a private segment.
         */

        /* Disallow everything but call gates. */
        if ( (b & _SEGMENT_TYPE) != 0xc00 )
            goto bad;

#if 0
        /* Can't allow far jump to a Xen-private segment. */
        if ( !VALID_CODESEL(a>>16) )
            goto bad;
#endif

        /* Reserved bits must be zero. */
        if ( (b & 0xe0) != 0 )
            goto bad;

        /* No base/limit check is needed for a call gate. */
        goto good;
    }

    /* Check that base is at least a page away from Xen-private area. */
    base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16);
    if ( base >= (PAGE_OFFSET - PAGE_SIZE) )
        goto bad;

    /* Check and truncate the limit if necessary. */
    limit = (b&0xf0000) | (a&0xffff);
    limit++; /* We add one because limit is inclusive. */
    if ( (b & _SEGMENT_G) )
        limit <<= 12;

    if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC )
    {
        /*
         * Grows-down limit check.
         * NB. limit == 0xFFFFF provides no access      (if G=1).
         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
         */
        if ( (base + limit) > base )
        {
            limit = -(base & PAGE_MASK);
            goto truncate;
        }
    }
    else
    {
        /*
         * Grows-up limit check.
         * NB. limit == 0xFFFFF provides 4GB access (if G=1).
         *     limit == 0x00000 provides 4kB access (if G=1).
         */
        if ( ((base + limit) <= base) ||
             ((base + limit) > PAGE_OFFSET) )
        {
            limit = PAGE_OFFSET - base;
        truncate:
            if ( !(b & _SEGMENT_G) )
                goto bad; /* too dangerous; too hard to work out... */
            limit = (limit >> 12) - 1;
            d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff;
            d[1] &= ~0xf0000; d[1] |= limit & 0xf0000;
        }
    }

 good:
    return 1;
 bad:
    return 0;
}
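
#if 0
/*
 * Illustrative sketch, not part of this revision: what check_descriptor()
 * makes of a conventional flat 4GB, DPL-3 code segment (base 0, limit
 * 0xfffff, G=1).  Assuming PAGE_OFFSET on x86_64 lies far above 4GB, neither
 * the base check nor the grows-up limit check triggers, so the descriptor is
 * accepted unmodified.
 */
static void example_check_flat_code_descriptor(void)
{
    unsigned long d[2];
    d[0] = 0x0000ffffUL; /* limit[15:0] = 0xffff, base[15:0] = 0           */
    d[1] = 0x00cffa00UL; /* G=1 D=1, limit[19:16] = 0xf, P=1 DPL=3 S=1,
                            type = code/readable, base[31:16] = 0          */
    ASSERT(check_descriptor(d) == 1);
}
#endif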

void destroy_gdt(struct exec_domain *ed)
{
    int i;
    unsigned long pfn;

    for ( i = 0; i < 16; i++ )
    {
        if ( (pfn = l1_pgentry_to_pagenr(ed->mm.perdomain_ptes[i])) != 0 )
            put_page_and_type(&frame_table[pfn]);
        ed->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
    }
}

long set_gdt(struct exec_domain *ed,
             unsigned long *frames,
             unsigned int entries)
{
    struct domain *d = ed->domain;
    /* NB. There are 512 8-byte entries per GDT page. */
    int i = 0, nr_pages = (entries + 511) / 512;
    struct desc_struct *vgdt;
    unsigned long pfn;

    /* Check the first page in the new GDT. */
    if ( (pfn = frames[0]) >= max_page )
        goto fail;

    /* The first page is special because Xen owns a range of entries in it. */
    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
    {
        /* GDT checks failed: try zapping the Xen reserved entries. */
        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
            goto fail;
        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
               NR_RESERVED_GDT_ENTRIES*8);
        unmap_domain_mem(vgdt);
        put_page_and_type(&frame_table[pfn]);

        /* Okay, we zapped the entries. Now try the GDT checks again. */
        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
            goto fail;
    }

    /* Check the remaining pages in the new GDT. */
    for ( i = 1; i < nr_pages; i++ )
        if ( ((pfn = frames[i]) >= max_page) ||
             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
            goto fail;

    /* Copy reserved GDT entries to the new GDT. */
    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
           gdt_table + FIRST_RESERVED_GDT_ENTRY,
           NR_RESERVED_GDT_ENTRIES*8);
    unmap_domain_mem(vgdt);

    /* Tear down the old GDT. */
    destroy_gdt(ed);

    /* Install the new GDT. */
    for ( i = 0; i < nr_pages; i++ )
        ed->mm.perdomain_ptes[i] =
            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);

    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
    SET_GDT_ENTRIES(ed, entries);

    return 0;

 fail:
    while ( i-- > 0 )
        put_page_and_type(&frame_table[frames[i]]);
    return -EINVAL;
}

long do_set_gdt(unsigned long *frame_list, unsigned int entries)
{
    int nr_pages = (entries + 511) / 512;
    unsigned long frames[16];
    long ret;

    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
        return -EINVAL;

    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
        return -EFAULT;

    if ( (ret = set_gdt(current, frames, entries)) == 0 )
    {
        local_flush_tlb();
        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->mm.gdt));
    }

    return ret;
}

long do_update_descriptor(
    unsigned long pa, unsigned long word1, unsigned long word2)
{
    unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2];
    struct pfn_info *page;
    long ret = -EINVAL;

    d[0] = word1;
    d[1] = word2;

    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) )
        return -EINVAL;

    page = &frame_table[pfn];
    if ( unlikely(!get_page(page, current->domain)) )
        return -EINVAL;

    /* Check if the given frame is in use in an unsafe context. */
    switch ( page->u.inuse.type_info & PGT_type_mask )
    {
    case PGT_gdt_page:
        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
        if ( (l1_pgentry_to_pagenr(current->mm.perdomain_ptes[0]) == pfn) &&
             (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
             (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
            goto out;
        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
            goto out;
        break;
    case PGT_ldt_page:
        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
            goto out;
        break;
    default:
        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
            goto out;
        break;
    }

    /* All is good so make the update. */
    gdt_pent = map_domain_mem(pa);
    memcpy(gdt_pent, d, 8);
    unmap_domain_mem(gdt_pent);

    put_page_type(page);

    ret = 0; /* success */

 out:
    put_page(page);
    return ret;
}
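
#if 0
/*
 * Illustrative sketch, not part of this revision: updating one entry of a
 * guest GDT page the way do_update_descriptor() does on a guest's behalf.
 * 'gdt_mfn' and 'slot' are hypothetical example values; the descriptor
 * words are the flat DPL-3 code segment from the check_descriptor() sketch
 * above.
 */
static long example_update_descriptor(unsigned long gdt_mfn, unsigned int slot)
{
    /* Byte address of the 8-byte descriptor slot within the GDT frame. */
    unsigned long pa = (gdt_mfn << PAGE_SHIFT) + (slot * 8);
    return do_update_descriptor(pa, 0x0000ffffUL, 0x00cffa00UL);
}
#endif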

#ifdef MEMORY_GUARD

#if 1

void *memguard_init(void *heap_start) { return heap_start; }
void memguard_guard_range(void *p, unsigned long l) {}
void memguard_unguard_range(void *p, unsigned long l) {}

#else

void *memguard_init(void *heap_start)
{
    l1_pgentry_t *l1;
    int i, j;

    /* Round the allocation pointer up to a page boundary. */
    heap_start = (void *)(((unsigned long)heap_start + (PAGE_SIZE-1)) &
                          PAGE_MASK);

    /* Memory guarding is incompatible with super pages. */
    for ( i = 0; i < (xenheap_phys_end >> L2_PAGETABLE_SHIFT); i++ )
    {
        l1 = (l1_pgentry_t *)heap_start;
        heap_start = (void *)((unsigned long)heap_start + PAGE_SIZE);
        for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
            l1[j] = mk_l1_pgentry((i << L2_PAGETABLE_SHIFT) |
                                  (j << L1_PAGETABLE_SHIFT) |
                                  __PAGE_HYPERVISOR);
        idle_pg_table[i] = idle_pg_table[i + l2_table_offset(PAGE_OFFSET)] =
            mk_l2_pgentry(virt_to_phys(l1) | __PAGE_HYPERVISOR);
    }

    return heap_start;
}

static void __memguard_change_range(void *p, unsigned long l, int guard)
{
    l1_pgentry_t *l1;
    l2_pgentry_t *l2;
    unsigned long _p = (unsigned long)p;
    unsigned long _l = (unsigned long)l;

    /* Ensure we are dealing with a page-aligned whole number of pages. */
    ASSERT((_p&PAGE_MASK) != 0);
    ASSERT((_l&PAGE_MASK) != 0);
    ASSERT((_p&~PAGE_MASK) == 0);
    ASSERT((_l&~PAGE_MASK) == 0);

    while ( _l != 0 )
    {
        l2 = &idle_pg_table[l2_table_offset(_p)];
        l1 = l2_pgentry_to_l1(*l2) + l1_table_offset(_p);
        if ( guard )
            *l1 = mk_l1_pgentry(l1_pgentry_val(*l1) & ~_PAGE_PRESENT);
        else
            *l1 = mk_l1_pgentry(l1_pgentry_val(*l1) | _PAGE_PRESENT);
        _p += PAGE_SIZE;
        _l -= PAGE_SIZE;
    }
}

void memguard_guard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 1);
    local_flush_tlb();
}

void memguard_unguard_range(void *p, unsigned long l)
{
    __memguard_change_range(p, l, 0);
}

#endif
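
#if 0
/*
 * Illustrative sketch, not part of this revision: typical use of the memory
 * guard.  Guarding a range while it should be untouched turns any stray
 * access into a page fault; the range is unguarded again before reuse.
 * 'page' is a hypothetical xenheap allocation.
 */
static void example_guard_page(void *page)
{
    memguard_guard_range(page, PAGE_SIZE);   /* now inaccessible  */
    /* ... any access to 'page' here would fault ... */
    memguard_unguard_range(page, PAGE_SIZE); /* accessible again  */
}
#endif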

#endif