xen/arch/x86/x86_64/mm.c @ 16586:cd5e1e76d0bc (debuggers.hg)

32-on-64: Fix domain address-size clamping, implement copy-on-grant-transfer,
and eliminate 166GB memory limit for x86/64 Xen.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
Author: Keir Fraser <keir.fraser@citrix.com>
Date:   Thu Dec 06 13:39:19 2007 +0000

/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <public/memory.h>

#ifdef CONFIG_COMPAT
unsigned int m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
#endif

/* Top-level master (and idle-domain) page directory. */
l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    idle_pg_table[L4_PAGETABLE_ENTRIES];

/* Enough page directories to map bottom 4GB of the memory map. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_identmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_identmap[4*L2_PAGETABLE_ENTRIES];

/* Enough page directories to map the Xen text and static data. */
l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l3_xenmap[L3_PAGETABLE_ENTRIES];
l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
    l2_xenmap[L2_PAGETABLE_ENTRIES];
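
/*
 * Allocate one page for use in Xen's own page tables: from the domain heap
 * once the heap is available, or from the boot allocator during early boot.
 */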
void *alloc_xen_pagetable(void)
{
    extern int early_boot;
    unsigned long mfn;

    if ( !early_boot )
    {
        struct page_info *pg = alloc_domheap_page(NULL);
        BUG_ON(pg == NULL);
        return page_to_virt(pg);
    }

    mfn = alloc_boot_pages(1, 1);
    BUG_ON(mfn == 0);
    return mfn_to_virt(mfn);
}
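
/*
 * Return a pointer to the L2 entry mapping virtual address 'v' in the idle
 * (Xen) page tables, allocating and installing any missing intermediate
 * L3/L2 tables along the way.
 */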
l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
{
    l4_pgentry_t *pl4e;
    l3_pgentry_t *pl3e;
    l2_pgentry_t *pl2e;

    pl4e = &idle_pg_table[l4_table_offset(v)];
    if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
    {
        pl3e = alloc_xen_pagetable();
        clear_page(pl3e);
        l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
    }

    pl3e = l4e_to_l3e(*pl4e) + l3_table_offset(v);
    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
    {
        pl2e = alloc_xen_pagetable();
        clear_page(pl2e);
        l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
    }

    pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
    return pl2e;
}
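
/*
 * Build the machine-to-phys (M2P) mappings: writable for Xen, read-only and
 * user-accessible for 64-bit guests, a compatibility-mode M2P for 32-on-64
 * guests, and the linear page-table self-mapping.
 */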
void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg, *l2_pg, *l3_pg;

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l3_pg = alloc_domheap_page(NULL)) == NULL )
        goto nomem;
    l3_ro_mpt = page_to_virt(l3_pg);
    clear_page(l3_ro_mpt);
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
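    /*
     * Map the M2P in 2MB superpages: writable for Xen at RDWR_MPT_VIRT_START,
     * and (via the L2 entries installed below under RO_MPT_VIRT_START)
     * read-only and user-accessible for guests.
     */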
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55,
               1UL << L2_PAGETABLE_SHIFT);
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
                goto nomem;
            va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
            l2_ro_mpt = page_to_virt(l2_pg);
            clear_page(l2_ro_mpt);
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
            l2_ro_mpt += l2_table_offset(va);
        }
        /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
        l2e_write(l2_ro_mpt, l2e_from_page(
            l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Create user-accessible L2 directory to map the MPT for compat guests. */
    BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
        HIRO_COMPAT_MPT_VIRT_START)]);
    if ( (l2_pg = alloc_domheap_page(NULL)) == NULL )
        goto nomem;
    compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
    clear_page(l2_ro_mpt);
    l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
              l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
    l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
    /* Allocate and map the compatibility mode machine-to-phys table. */
    mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
    if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
        mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    if ( m2p_compat_vstart + mpt_size < MACH2PHYS_COMPAT_VIRT_END )
        m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
                        (i << L2_PAGETABLE_SHIFT)),
               0x55,
               1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
        l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
        l2_ro_mpt++;
    }

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
    return;

 nomem:
    panic("Not enough memory for m2p table\n");
}

void __init setup_idle_pagetable(void)
{
    /* Install per-domain mappings for idle domain. */
    l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
              l4e_from_page(
                  virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
                  __PAGE_HYPERVISOR));
}

void __init zap_low_mappings(void)
{
    BUG_ON(num_online_cpus() != 1);

    /* Remove aliased mapping of first 1:1 PML4 entry. */
    l4e_write(&idle_pg_table[0], l4e_empty());
    flush_local(FLUSH_TLB_GLOBAL);

    /* Replace with mapping of the boot trampoline only. */
    map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
                     0x10, __PAGE_HYPERVISOR);
}
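
/*
 * Share the frames backing the M2P (and compat M2P) tables read-only with
 * privileged guests, after sanity-checking the 'struct page_info' layout.
 */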
void __init subarch_init_memory(void)
{
    unsigned long i, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /*
     * We are rather picky about the layout of 'struct page_info'. The
     * count_info and domain fields must be adjacent, as we perform atomic
     * 64-bit operations on them.
     */
    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) !=
                 (offsetof(struct page_info, count_info) + sizeof(u32)));
    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
    BUILD_BUG_ON(sizeof(struct page_info) !=
                 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));

    /* M2P table is mappable read-only by privileged domains. */
    for ( v  = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    for ( v  = RDWR_COMPAT_MPT_VIRT_START;
          v != RDWR_COMPAT_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
}
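
/*
 * x86_64-specific memory op. XENMEM_machphys_mfn_list reports the MFNs of
 * the 2MB superpage frames backing the M2P table, one extent per superpage.
 */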
long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v;
    xen_pfn_t mfn;
    unsigned int i;
    long rc = 0;

    switch ( op )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        for ( i = 0, v = RDWR_MPT_VIRT_START;
              (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
              i++, v += 1 << 21 )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                break;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                break;
            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
        }

        xmml.nr_extents = i;
        if ( copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}
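
/* stack_switch hypercall: record the guest kernel stack segment and pointer. */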
long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.guest_context.kernel_ss = ss;
    current->arch.guest_context.kernel_sp = esp;
    return 0;
}
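
/*
 * set_segment_base hypercall: update the FS or GS base MSRs for the current
 * vcpu (recording the new value in its context), or load a user %gs selector.
 */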
long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    switch ( which )
    {
    case SEGBASE_FS:
        if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.fs_base = base;
        break;

    case SEGBASE_GS_USER:
        if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_user = base;
        break;

    case SEGBASE_GS_KERNEL:
        if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
            ret = -EFAULT;
        else
            v->arch.guest_context.gs_base_kernel = base;
        break;

    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "     swapgs              \n"
            "1:   movl %k0,%%gs       \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:   xorl %k0,%k0        \n"
            "     jmp  1b             \n"
            ".previous                \n"
            ".section __ex_table,\"a\"\n"
            "    .align 8             \n"
            "    .quad 1b,2b          \n"
            ".previous                "
            : : "r" (base&0xffff) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;
    unsigned int dpl;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        goto good;

    /* Check and fix up the DPL. */
    dpl = (b >> 13) & 3;
    __fixup_guest_selector(dom, dpl);
    b = (b & ~_SEGMENT_DPL) | (dpl << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( is_pv_32bit_domain(dom) && (b & _SEGMENT_L) )
            goto bad;
        goto good;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        goto good;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate the target code selector. */
    cs = a >> 16;
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    /*
     * Force DPL to zero, causing a GP fault with its error code indicating
     * the gate in use, allowing emulation. This is necessary because with
     * native guests (kernel in ring 3) call gates cannot be used directly
     * to transition from user to kernel mode (and whether a gate is used
     * to enter the kernel can only be determined when the gate is being
     * used), and with compat guests call gates cannot be used at all as
     * there are only 64-bit ones.
     * Store the original DPL in the selector's RPL field.
     */
    b &= ~_SEGMENT_DPL;
    cs = (cs & ~3) | dpl;
    a = (a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
        goto bad;

 good:
    d->a = a;
    d->b = b;
    return 1;
 bad:
    return 0;
}
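
/*
 * Clamp an allocation-width request to what a 32-on-64 PV guest can
 * physically address; all other domains get the requested width unchanged.
 */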
unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( (d == NULL) || !is_pv_32on64_domain(d) )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}
450 #include "compat/mm.c"
452 /*
453 * Local variables:
454 * mode: C
455 * c-set-style: "BSD"
456 * c-basic-offset: 4
457 * tab-width: 4
458 * indent-tabs-mode: nil
459 * End:
460 */