
view xen/arch/x86/x86_64/mm.c @ 16959:ed8ab1a36b09

x86-64: use 1GB pages in 1:1 mapping if available

At the same time, adjust the 2/4Mb page handling slightly in a few
places (to match the newly added code):
- when re-creating a large page mapping after finding that all small
page mappings in the respective area use identical flags and
suitable MFNs, the virtual address has already been incremented past
the area being dealt with, which needs to be accounted for in the
flush_area() invocation on that path
- don't or-in/and-out _PAGE_PSE on non-present pages
- when comparing flags, try to minimise the number of l1f_to_lNf()/
lNf_to_l1f() conversions used
- instead of skipping a single page when encountering a big page
mapping equal to what a small page mapping would establish, skip
ahead to the next larger page boundary (a minimal sketch of this
boundary computation follows the changeset metadata below)

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 28 10:17:05 2008 +0000 (2008-01-28)
parents cd5e1e76d0bc
children 57febe0264e1
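
Before the file listing, here is a minimal standalone sketch of the
boundary-skipping arithmetic described in the last bullet of the change
description. The helper name, shift constants and the example address are
illustrative assumptions, not code taken from the patch; it only demonstrates
how an address is advanced to the next 2MB or 1GB boundary instead of one
4kB page at a time.

/* Standalone demo: advance a virtual address to the next superpage boundary. */
#include <stdio.h>

#define L2_SHIFT 21  /* 2MB pages */
#define L3_SHIFT 30  /* 1GB pages */

/* Return the lowest boundary of the given order strictly above va. */
static unsigned long next_boundary(unsigned long va, unsigned int shift)
{
    return (va + (1UL << shift)) & ~((1UL << shift) - 1);
}

int main(void)
{
    unsigned long va = 0x40201000UL;  /* arbitrary address inside a 2MB page */

    printf("va              = %#lx\n", va);
    printf("next 2MB bound  = %#lx\n", next_boundary(va, L2_SHIFT));
    printf("next 1GB bound  = %#lx\n", next_boundary(va, L3_SHIFT));
    return 0;
}
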
line source
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/guest_access.h>
26 #include <asm/current.h>
27 #include <asm/asm_defns.h>
28 #include <asm/page.h>
29 #include <asm/flushtlb.h>
30 #include <asm/fixmap.h>
31 #include <asm/hypercall.h>
32 #include <asm/msr.h>
33 #include <public/memory.h>
35 #ifdef CONFIG_COMPAT
36 unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
37 #endif
39 /* Top-level master (and idle-domain) page directory. */
40 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
41 idle_pg_table[L4_PAGETABLE_ENTRIES];
43 /* Enough page directories to map bottom 4GB of the memory map. */
44 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
45 l3_identmap[L3_PAGETABLE_ENTRIES];
46 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
47 l2_identmap[4*L2_PAGETABLE_ENTRIES];
49 /* Enough page directories to map the Xen text and static data. */
50 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
51 l3_xenmap[L3_PAGETABLE_ENTRIES];
52 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
53 l2_xenmap[L2_PAGETABLE_ENTRIES];
55 void *alloc_xen_pagetable(void)
56 {
57 extern int early_boot;
58 unsigned long mfn;
60 if ( !early_boot )
61 {
62 struct page_info *pg = alloc_domheap_page(NULL);
63 BUG_ON(pg == NULL);
64 return page_to_virt(pg);
65 }
67 mfn = alloc_boot_pages(1, 1);
68 BUG_ON(mfn == 0);
69 return mfn_to_virt(mfn);
70 }
72 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
73 {
74 l4_pgentry_t *pl4e;
76 pl4e = &idle_pg_table[l4_table_offset(v)];
77 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
78 {
79 l3_pgentry_t *pl3e = alloc_xen_pagetable();
80 clear_page(pl3e);
81 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
82 }
84 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
85 }
87 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
88 {
89 l3_pgentry_t *pl3e;
91 pl3e = virt_to_xen_l3e(v);
92 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
93 {
94 l2_pgentry_t *pl2e = alloc_xen_pagetable();
95 clear_page(pl2e);
96 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
97 }
99 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
100 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
101 }
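/*
 * Illustrative sketch, not part of this file: how a caller might use the
 * virt_to_xen_l2e() helper above to install a single 2MB hypervisor
 * mapping.  The function name, the flag combination and the TLB flush
 * choice are assumptions made for the example only.
 */
static void __init example_map_2mb(unsigned long va, unsigned long mfn)
{
    /* Allocates the intermediate L3/L2 tables on demand if missing. */
    l2_pgentry_t *pl2e = virt_to_xen_l2e(va);

    /* Write a present 2MB (PSE) entry and flush this CPU's TLB. */
    l2e_write(pl2e, l2e_from_pfn(mfn, PAGE_HYPERVISOR | _PAGE_PSE));
    flush_local(FLUSH_TLB);
}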
103 void __init paging_init(void)
104 {
105 unsigned long i, mpt_size, va;
106 l3_pgentry_t *l3_ro_mpt;
107 l2_pgentry_t *l2_ro_mpt = NULL;
108 struct page_info *l1_pg, *l2_pg, *l3_pg;
110 /* Create user-accessible L2 directory to map the MPT for guests. */
111 if ( (l3_pg = alloc_domheap_page(NULL)) == NULL )
112 goto nomem;
113 l3_ro_mpt = page_to_virt(l3_pg);
114 clear_page(l3_ro_mpt);
115 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
116 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
118 /*
119 * Allocate and map the machine-to-phys table.
120 * This also ensures L3 is present for fixmaps.
121 */
122 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
123 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
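/*
 * Worked example, not part of this file: with max_page = 0x100000 (4GB of
 * RAM, i.e. 2^20 frames) and BYTES_PER_LONG = 8, the M2P table needs 8MB;
 * the two statements above round that up to a whole number of 2MB
 * superpages, so the loop below runs mpt_size >> 21 = 4 times.
 */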
124 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
125 {
126 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
127 goto nomem;
128 map_pages_to_xen(
129 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
130 page_to_mfn(l1_pg),
131 1UL << PAGETABLE_ORDER,
132 PAGE_HYPERVISOR);
133 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
134 1UL << L2_PAGETABLE_SHIFT);
135 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
136 {
137 if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
138 goto nomem;
139 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
140 l2_ro_mpt = page_to_virt(l2_pg);
141 clear_page(l2_ro_mpt);
142 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
143 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
144 l2_ro_mpt += l2_table_offset(va);
145 }
146 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
147 l2e_write(l2_ro_mpt, l2e_from_page(
148 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
149 l2_ro_mpt++;
150 }
152 /* Create user-accessible L2 directory to map the MPT for compat guests. */
153 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
154 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
155 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
156 HIRO_COMPAT_MPT_VIRT_START)]);
157 if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
158 goto nomem;
159 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
160 clear_page(l2_ro_mpt);
161 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
162 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
163 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
164 /* Allocate and map the compatibility mode machine-to-phys table. */
165 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
166 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
167 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
168 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
169 if ( m2p_compat_vstart + mpt_size < MACH2PHYS_COMPAT_VIRT_END )
170 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
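/*
 * Illustrative note, not part of this file: compat (32-bit) M2P entries
 * are 4 bytes rather than 8, so the table is half the native size, rounded
 * up to a 2MB boundary and clamped to the space reserved for it; raising
 * m2p_compat_vstart makes the table end exactly at MACH2PHYS_COMPAT_VIRT_END.
 */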
171 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
172 {
173 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
174 goto nomem;
175 map_pages_to_xen(
176 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
177 page_to_mfn(l1_pg),
178 1UL << PAGETABLE_ORDER,
179 PAGE_HYPERVISOR);
180 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
181 (i << L2_PAGETABLE_SHIFT)),
182 0x55,
183 1UL << L2_PAGETABLE_SHIFT);
184 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
185 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
186 l2_ro_mpt++;
187 }
189 /* Set up linear page table mapping. */
190 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
191 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
192 return;
194 nomem:
195 panic("Not enough memory for m2p table\n");
196 }
198 void __init setup_idle_pagetable(void)
199 {
200 /* Install per-domain mappings for idle domain. */
201 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
202 l4e_from_page(
203 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
204 __PAGE_HYPERVISOR));
205 }
207 void __init zap_low_mappings(void)
208 {
209 BUG_ON(num_online_cpus() != 1);
211 /* Remove aliased mapping of first 1:1 PML4 entry. */
212 l4e_write(&idle_pg_table[0], l4e_empty());
213 flush_local(FLUSH_TLB_GLOBAL);
215 /* Replace with mapping of the boot trampoline only. */
216 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
217 0x10, __PAGE_HYPERVISOR);
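/*
 * Illustrative note, not part of this file: 0x10 pages == 64kB, i.e. only
 * the low-memory boot trampoline remains identity-mapped after the first
 * PML4 slot has been zapped above.
 */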
218 }
220 void __init subarch_init_memory(void)
221 {
222 unsigned long i, v, m2p_start_mfn;
223 l3_pgentry_t l3e;
224 l2_pgentry_t l2e;
226 /*
227 * We are rather picky about the layout of 'struct page_info'. The
228 * count_info and domain fields must be adjacent, as we perform atomic
229 * 64-bit operations on them.
230 */
231 BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
232 (offsetof(struct page_info, count_info) + sizeof(u32)));
233 BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
234 BUILD_BUG_ON(sizeof(struct page_info) !=
235 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));
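/*
 * Illustrative sketch, not part of this file, of why the two fields must
 * be adjacent: both 32-bit words can then be read, checked and updated
 * with a single 64-bit cmpxchg.  Names, the owner check and the missing
 * count-overflow handling are simplifications, not the real get_page().
 */
static int example_get_page(struct page_info *page, u32 pickled_owner)
{
    u64 x, nx, y = *(volatile u64 *)&page->count_info;

    do {
        x = y;
        /* Low 32 bits: reference count (plus flags); high 32 bits: owner. */
        if ( ((u32)x == 0) || ((u32)(x >> 32) != pickled_owner) )
            return 0;            /* unallocated page or wrong owner */
        nx = x + 1;              /* take one more reference */
    } while ( (y = cmpxchg((u64 *)&page->count_info, x, nx)) != x );

    return 1;
}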
237 /* M2P table is mappable read-only by privileged domains. */
238 for ( v = RDWR_MPT_VIRT_START;
239 v != RDWR_MPT_VIRT_END;
240 v += 1 << L2_PAGETABLE_SHIFT )
241 {
242 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
243 l3_table_offset(v)];
244 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
245 continue;
246 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
247 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
248 continue;
249 m2p_start_mfn = l2e_get_pfn(l2e);
251 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
252 {
253 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
254 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
255 }
256 }
258 for ( v = RDWR_COMPAT_MPT_VIRT_START;
259 v != RDWR_COMPAT_MPT_VIRT_END;
260 v += 1 << L2_PAGETABLE_SHIFT )
261 {
262 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
263 l3_table_offset(v)];
264 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
265 continue;
266 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
267 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
268 continue;
269 m2p_start_mfn = l2e_get_pfn(l2e);
271 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
272 {
273 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
274 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
275 }
276 }
277 }
279 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
280 {
281 struct xen_machphys_mfn_list xmml;
282 l3_pgentry_t l3e;
283 l2_pgentry_t l2e;
284 unsigned long v;
285 xen_pfn_t mfn;
286 unsigned int i;
287 long rc = 0;
289 switch ( op )
290 {
291 case XENMEM_machphys_mfn_list:
292 if ( copy_from_guest(&xmml, arg, 1) )
293 return -EFAULT;
295 for ( i = 0, v = RDWR_MPT_VIRT_START;
296 (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
297 i++, v += 1 << 21 )
298 {
299 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
300 l3_table_offset(v)];
301 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
302 break;
303 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
304 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
305 break;
306 mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
307 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
308 return -EFAULT;
309 }
311 xmml.nr_extents = i;
312 if ( copy_to_guest(arg, &xmml, 1) )
313 return -EFAULT;
315 break;
317 default:
318 rc = -ENOSYS;
319 break;
320 }
322 return rc;
323 }
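/*
 * Illustrative sketch, not part of this file: how a privileged guest might
 * invoke the XENMEM_machphys_mfn_list operation handled above.  The
 * hypercall wrapper name follows the usual guest-side convention and is an
 * assumption; each returned MFN is the first frame of one 2MB M2P chunk.
 */
static int example_list_m2p_frames(void)
{
    xen_pfn_t frames[32];
    struct xen_machphys_mfn_list xmml = { .max_extents = 32 };
    int rc;

    set_xen_guest_handle(xmml.extent_start, frames);
    rc = HYPERVISOR_memory_op(XENMEM_machphys_mfn_list, &xmml);
    if ( rc == 0 )
        printk("M2P table spans %u 2MB chunks\n", xmml.nr_extents);
    return rc;
}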
325 long do_stack_switch(unsigned long ss, unsigned long esp)
326 {
327 fixup_guest_stack_selector(current->domain, ss);
328 current->arch.guest_context.kernel_ss = ss;
329 current->arch.guest_context.kernel_sp = esp;
330 return 0;
331 }
333 long do_set_segment_base(unsigned int which, unsigned long base)
334 {
335 struct vcpu *v = current;
336 long ret = 0;
338 switch ( which )
339 {
340 case SEGBASE_FS:
341 if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
342 ret = -EFAULT;
343 else
344 v->arch.guest_context.fs_base = base;
345 break;
347 case SEGBASE_GS_USER:
348 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
349 ret = -EFAULT;
350 else
351 v->arch.guest_context.gs_base_user = base;
352 break;
354 case SEGBASE_GS_KERNEL:
355 if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
356 ret = -EFAULT;
357 else
358 v->arch.guest_context.gs_base_kernel = base;
359 break;
361 case SEGBASE_GS_USER_SEL:
362 __asm__ __volatile__ (
363 " swapgs \n"
364 "1: movl %k0,%%gs \n"
365 " "safe_swapgs" \n"
366 ".section .fixup,\"ax\" \n"
367 "2: xorl %k0,%k0 \n"
368 " jmp 1b \n"
369 ".previous \n"
370 ".section __ex_table,\"a\"\n"
371 " .align 8 \n"
372 " .quad 1b,2b \n"
373 ".previous "
374 : : "r" (base&0xffff) );
375 break;
377 default:
378 ret = -EINVAL;
379 break;
380 }
382 return ret;
383 }
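/*
 * Illustrative sketch, not part of this file: guest-side use of the
 * handler above.  A 64-bit PV kernel runs in ring 3 and cannot execute
 * WRMSR itself, so it asks Xen to load the kernel GS base.  The wrapper
 * name follows the usual guest convention and is an assumption here.
 */
static void example_switch_gs_base(unsigned long percpu_base)
{
    if ( HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, percpu_base) )
        BUG();   /* fails only if the underlying MSR write faults */
}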
386 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
387 int check_descriptor(const struct domain *dom, struct desc_struct *d)
388 {
389 u32 a = d->a, b = d->b;
390 u16 cs;
391 unsigned int dpl;
393 /* A not-present descriptor will always fault, so is safe. */
394 if ( !(b & _SEGMENT_P) )
395 goto good;
397 /* Check and fix up the DPL. */
398 dpl = (b >> 13) & 3;
399 __fixup_guest_selector(dom, dpl);
400 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
402 /* All code and data segments are okay. No base/limit checking. */
403 if ( (b & _SEGMENT_S) )
404 {
405 if ( is_pv_32bit_domain(dom) && (b & _SEGMENT_L) )
406 goto bad;
407 goto good;
408 }
410 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
411 if ( (b & _SEGMENT_TYPE) == 0x000 )
412 goto good;
414 /* Everything but a call gate is discarded here. */
415 if ( (b & _SEGMENT_TYPE) != 0xc00 )
416 goto bad;
418 /* Validate the target code selector. */
419 cs = a >> 16;
420 if ( !guest_gate_selector_okay(dom, cs) )
421 goto bad;
422 /*
423 * Force DPL to zero, causing a GP fault with its error code indicating
424 * the gate in use, allowing emulation. This is necessary because with
425 * native guests (kernel in ring 3) call gates cannot be used directly
426 * to transition from user to kernel mode (and whether a gate is used
427 * to enter the kernel can only be determined when the gate is being
428 * used), and with compat guests call gates cannot be used at all as
429 * there are only 64-bit ones.
430 * Store the original DPL in the selector's RPL field.
431 */
432 b &= ~_SEGMENT_DPL;
433 cs = (cs & ~3) | dpl;
434 a = (a & 0xffffU) | (cs << 16);
436 /* Reserved bits must be zero. */
437 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
438 goto bad;
440 good:
441 d->a = a;
442 d->b = b;
443 return 1;
444 bad:
445 return 0;
446 }
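/*
 * Worked example, not part of this file: a call gate arriving with DPL 3
 * and target selector 0x0810 leaves check_descriptor() with DPL 0 and
 * target selector 0x0813.  The cleared DPL makes every far call through
 * the gate raise #GP with an error code naming the gate, so Xen can
 * emulate the transfer, and the original DPL is recoverable from the
 * selector's RPL bits.
 */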
448 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
449 {
450 if ( (d == NULL) || !is_pv_32on64_domain(d) )
451 return bits;
452 return min(d->arch.physaddr_bitsize, bits);
453 }
455 #include "compat/mm.c"
457 /*
458 * Local variables:
459 * mode: C
460 * c-set-style: "BSD"
461 * c-basic-offset: 4
462 * tab-width: 4
463 * indent-tabs-mode: nil
464 * End:
465 */