
view xen/arch/x86/x86_64/mm.c @ 20661:adb62ca21d31

memory hotadd 2/7: Destroy m2p table for hot-added memory when hot-add failed.

Since the m2p table should no longer be in use when we destroy it, we do
not need to consider cleaning up head/tail mappings that may have existed
before the hot-add.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Dec 11 08:54:37 2009 +0000 (2009-12-11)
parents b7cf749e14fc
children 0ca5a5f477be
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/guest_access.h>
27 #include <asm/current.h>
28 #include <asm/asm_defns.h>
29 #include <asm/page.h>
30 #include <asm/flushtlb.h>
31 #include <asm/fixmap.h>
32 #include <asm/hypercall.h>
33 #include <asm/msr.h>
34 #include <asm/setup.h>
35 #include <public/memory.h>
37 /* Parameters for PFN/MADDR compression. */
38 unsigned long __read_mostly max_pdx;
39 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
40 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
41 unsigned long __read_mostly pfn_top_mask = 0;
42 unsigned long __read_mostly ma_top_mask = 0;
43 unsigned long __read_mostly pfn_hole_mask = 0;
44 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
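/*
 * These masks implement PFN <-> PDX compression: pfn_to_pdx() keeps the PFN
 * bits below the hole (pfn_pdx_bottom_mask) and shifts the bits above it
 * (pfn_top_mask) down by pfn_pdx_hole_shift, squeezing out address bits that
 * are never used so that frame_table and the M2P stay dense.
 */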
46 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
48 DEFINE_PER_CPU_READ_MOSTLY(void *, compat_arg_xlat);
50 /* Top-level master (and idle-domain) page directory. */
51 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
52 idle_pg_table[L4_PAGETABLE_ENTRIES];
54 /* Enough page directories to map bottom 4GB of the memory map. */
55 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
56 l3_identmap[L3_PAGETABLE_ENTRIES];
57 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
58 l2_identmap[4*L2_PAGETABLE_ENTRIES];
60 /* Enough page directories to map the Xen text and static data. */
61 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
62 l3_xenmap[L3_PAGETABLE_ENTRIES];
63 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
64 l2_xenmap[L2_PAGETABLE_ENTRIES];
66 int __mfn_valid(unsigned long mfn)
67 {
68 return likely(mfn < max_page) &&
69 likely(!(mfn & pfn_hole_mask)) &&
70 likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT,
71 pdx_group_valid));
72 }
74 void *alloc_xen_pagetable(void)
75 {
76 unsigned long mfn;
78 if ( !early_boot )
79 {
80 struct page_info *pg = alloc_domheap_page(NULL, 0);
81 BUG_ON(pg == NULL);
82 return page_to_virt(pg);
83 }
85 mfn = alloc_boot_pages(1, 1);
86 return mfn_to_virt(mfn);
87 }
89 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
90 {
91 l4_pgentry_t *pl4e;
93 pl4e = &idle_pg_table[l4_table_offset(v)];
94 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
95 {
96 l3_pgentry_t *pl3e = alloc_xen_pagetable();
97 clear_page(pl3e);
98 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
99 }
101 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
102 }
104 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
105 {
106 l3_pgentry_t *pl3e;
108 pl3e = virt_to_xen_l3e(v);
109 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
110 {
111 l2_pgentry_t *pl2e = alloc_xen_pagetable();
112 clear_page(pl2e);
113 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
114 }
116 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
117 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
118 }
120 void *do_page_walk(struct vcpu *v, unsigned long addr)
121 {
122 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
123 l4_pgentry_t l4e, *l4t;
124 l3_pgentry_t l3e, *l3t;
125 l2_pgentry_t l2e, *l2t;
126 l1_pgentry_t l1e, *l1t;
128 if ( is_hvm_vcpu(v) )
129 return NULL;
131 l4t = mfn_to_virt(mfn);
132 l4e = l4t[l4_table_offset(addr)];
133 mfn = l4e_get_pfn(l4e);
134 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
135 return NULL;
137 l3t = mfn_to_virt(mfn);
138 l3e = l3t[l3_table_offset(addr)];
139 mfn = l3e_get_pfn(l3e);
140 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
141 return NULL;
142 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
143 return mfn_to_virt(mfn) + (addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
145 l2t = mfn_to_virt(mfn);
146 l2e = l2t[l2_table_offset(addr)];
147 mfn = l2e_get_pfn(l2e);
148 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
149 return NULL;
150 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
151 return mfn_to_virt(mfn) + (addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
153 l1t = mfn_to_virt(mfn);
154 l1e = l1t[l1_table_offset(addr)];
155 mfn = l1e_get_pfn(l1e);
156 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
157 return NULL;
159 return mfn_to_virt(mfn) + (addr & ~PAGE_MASK);
160 }
162 void __init pfn_pdx_hole_setup(unsigned long mask)
163 {
164 unsigned int i, j, bottom_shift, hole_shift;
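/*
 * Find the widest run of zero bits in the PFN mask passed in (bits that are
 * clear in every RAM page's frame number); that run becomes the hole which
 * pfn_to_pdx() compresses away.
 */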
166 for ( hole_shift = bottom_shift = j = 0; ; )
167 {
168 i = find_next_zero_bit(&mask, BITS_PER_LONG, j);
169 j = find_next_bit(&mask, BITS_PER_LONG, i);
170 if ( j >= BITS_PER_LONG )
171 break;
172 if ( j - i > hole_shift )
173 {
174 hole_shift = j - i;
175 bottom_shift = i;
176 }
177 }
178 if ( !hole_shift )
179 return;
181 printk(KERN_INFO "PFN compression on bits %u...%u\n",
182 bottom_shift, bottom_shift + hole_shift - 1);
184 pfn_pdx_hole_shift = hole_shift;
185 pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1;
186 ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1;
187 pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift;
188 pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask);
189 ma_top_mask = pfn_top_mask << PAGE_SHIFT;
190 }
192 /*
193 * Allocate page table pages for m2p table
194 */
195 struct mem_hotadd_info
196 {
197 unsigned long spfn;
198 unsigned long epfn;
199 unsigned long cur;
200 };
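/*
 * [spfn, epfn) is the PFN range being hot-added; cur tracks how much of that
 * range has already been consumed.  Page-table pages needed to map the new
 * M2P entries are taken from the hot-added range itself via alloc_hotadd_mfn().
 */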
202 int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
203 {
204 return (pfn < info->epfn && pfn >= info->spfn);
205 }
207 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
208 {
209 unsigned mfn;
211 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
212 info->cur >= info->spfn);
214 mfn = info->cur;
215 info->cur += (1UL << PAGETABLE_ORDER);
216 return mfn;
217 }
219 #define M2P_NO_MAPPED 0
220 #define M2P_2M_MAPPED 1
221 #define M2P_1G_MAPPED 2
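/*
 * Report whether the M2P slot covering spfn is already backed by a mapping
 * (a 1G or 2M superpage set up earlier, e.g. at boot), so callers can skip
 * or reuse it.
 */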
222 static int m2p_mapped(unsigned long spfn)
223 {
224 unsigned long va;
225 l3_pgentry_t *l3_ro_mpt;
226 l2_pgentry_t *l2_ro_mpt;
228 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
229 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
231 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
232 (_PAGE_PRESENT |_PAGE_PSE))
233 {
234 case _PAGE_PSE|_PAGE_PRESENT:
235 return M2P_1G_MAPPED;
236 break;
237 /* Check for next level */
238 case _PAGE_PRESENT:
239 break;
240 default:
241 return M2P_NO_MAPPED;
242 break;
243 }
244 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
246 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
247 return M2P_2M_MAPPED;
249 return M2P_NO_MAPPED;
250 }
252 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
253 {
254 unsigned long i, va, rwva, pt_pfn;
255 unsigned long smap = info->spfn, emap = info->epfn;
257 l3_pgentry_t *l3_ro_mpt;
258 l2_pgentry_t *l2_ro_mpt;
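/* Compat M2P entries are 4 bytes wide, so the compat table only covers the
 * first (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2 frames;
 * anything above that range has nothing to destroy here. */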
260 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
261 return;
263 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
264 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
266 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
268 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
270 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
272 for ( i = smap; i < emap; )
273 {
274 va = HIRO_COMPAT_MPT_VIRT_START +
275 i * sizeof(*compat_machine_to_phys_mapping);
276 rwva = RDWR_COMPAT_MPT_VIRT_START +
277 i * sizeof(*compat_machine_to_phys_mapping);
278 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
279 {
280 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
281 if ( hotadd_mem_valid(pt_pfn, info) )
282 {
283 destroy_xen_mappings(rwva, rwva +
284 (1UL << L2_PAGETABLE_SHIFT));
285 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
286 }
287 }
289 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
290 }
292 return;
293 }
295 void destroy_m2p_mapping(struct mem_hotadd_info *info)
296 {
297 l3_pgentry_t *l3_ro_mpt;
298 unsigned long i, va, rwva;
299 unsigned long smap = info->spfn, emap = info->epfn;
301 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
303 /*
304 * No need to clean up m2p structures that existed before the hotplug
305 */
306 for (i = smap; i < emap;)
307 {
308 unsigned long pt_pfn;
309 l2_pgentry_t *l2_ro_mpt;
311 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
312 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
314 /* 1G mapping should not be created by mem hotadd */
315 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
316 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
317 {
318 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
319 (1UL << (L3_PAGETABLE_SHIFT - 3) );
320 continue;
321 }
323 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
324 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
325 {
326 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
327 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
328 continue;
329 }
331 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
332 if ( hotadd_mem_valid(pt_pfn, info) )
333 {
334 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
336 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
337 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
338 }
339 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
340 (1UL << (L2_PAGETABLE_SHIFT - 3));
341 }
343 destroy_compat_m2p_mapping(info);
345 /* Brute-Force flush all TLB */
346 flush_tlb_all();
347 return;
348 }
350 /*
351 * Allocate and map the compatibility mode machine-to-phys table.
352 * spfn/epfn: the pfn range to be set up
353 * free_s/free_e: the pfn range that is still free
354 */
355 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
356 {
357 unsigned long i, va, smap, emap, rwva, epfn = info->epfn;
358 unsigned int n, memflags;
359 l3_pgentry_t *l3_ro_mpt = NULL;
360 l2_pgentry_t *l2_ro_mpt = NULL;
361 struct page_info *l1_pg;
363 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
365 /*
366 * Notice: For hot-added memory, only range below m2p_compat_vstart
367 * will be filled up (assuming memory is discontinuous when booting).
368 */
369 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
370 return 0;
372 if (epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2))
373 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
375 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
376 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
378 va = HIRO_COMPAT_MPT_VIRT_START +
379 smap * sizeof(*compat_machine_to_phys_mapping);
380 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
382 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
384 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
386 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
387 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
388 sizeof(*compat_machine_to_phys_mapping))
389 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
390 sizeof(*compat_machine_to_phys_mapping));
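/* Probe one MFN per PDX group; if no frame covered by this 2M chunk of compat
 * M2P entries is valid, the whole chunk can be skipped. */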
392 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
393 {
394 va = HIRO_COMPAT_MPT_VIRT_START +
395 i * sizeof(*compat_machine_to_phys_mapping);
397 rwva = RDWR_COMPAT_MPT_VIRT_START +
398 i * sizeof(*compat_machine_to_phys_mapping);
400 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
401 continue;
403 for ( n = 0; n < CNT; ++n)
404 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
405 break;
406 if ( n == CNT )
407 continue;
409 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
411 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
412 map_pages_to_xen(rwva,
413 page_to_mfn(l1_pg),
414 1UL << PAGETABLE_ORDER,
415 PAGE_HYPERVISOR);
416 memset((void *)rwva, 0x55, 1UL << L2_PAGETABLE_SHIFT);
417 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
418 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
419 }
420 #undef CNT
421 #undef MFN
422 return 0;
423 }
425 /*
426 * Allocate and map the machine-to-phys table.
427 * The L3 for the RO/RW MPT and the L2 for the compat MPT should already be set up
428 */
429 int setup_m2p_table(struct mem_hotadd_info *info)
430 {
431 unsigned long i, va, smap, emap;
432 unsigned int n, memflags;
433 l2_pgentry_t *l2_ro_mpt = NULL;
434 l3_pgentry_t *l3_ro_mpt = NULL;
435 struct page_info *l1_pg, *l2_pg;
436 int ret = 0;
438 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
439 & _PAGE_PRESENT);
440 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
442 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
443 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
444 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
446 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
448 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
449 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
450 sizeof(*machine_to_phys_mapping))
452 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
453 sizeof(*machine_to_phys_mapping));
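/*
 * Walk the new range in 2M-of-entries steps, skipping anything the existing
 * M2P already covers (m2p_mapped()); backing pages for new entries come from
 * the hot-added range itself via alloc_hotadd_mfn().
 */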
455 i = smap;
456 while ( i < emap )
457 {
458 switch ( m2p_mapped(i) )
459 {
460 case M2P_1G_MAPPED:
461 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
462 (1UL << (L3_PAGETABLE_SHIFT - 3));
463 continue;
464 case M2P_2M_MAPPED:
465 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
466 (1UL << (L2_PAGETABLE_SHIFT - 3));
467 continue;
468 default:
469 break;
470 }
472 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
473 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
475 for ( n = 0; n < CNT; ++n)
476 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
477 break;
478 if ( n == CNT )
479 l1_pg = NULL;
480 else
481 {
482 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
483 map_pages_to_xen(
484 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
485 page_to_mfn(l1_pg),
486 1UL << PAGETABLE_ORDER,
487 PAGE_HYPERVISOR);
488 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
489 0x55, 1UL << L2_PAGETABLE_SHIFT);
491 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
492 _PAGE_PSE));
493 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
494 _PAGE_PRESENT )
495 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
496 l2_table_offset(va);
497 else
498 {
499 l2_pg = alloc_domheap_page(NULL, memflags);
501 if (!l2_pg)
502 {
503 ret = -ENOMEM;
504 goto error;
505 }
507 l2_ro_mpt = page_to_virt(l2_pg);
508 clear_page(l2_ro_mpt);
509 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
510 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
511 l2_ro_mpt += l2_table_offset(va);
512 }
514 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
515 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg,
516 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
517 }
518 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
519 l2_ro_mpt = NULL;
520 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
521 }
522 #undef CNT
523 #undef MFN
525 ret = setup_compat_m2p_table(info);
526 error:
527 return ret;
528 }
530 void __init paging_init(void)
531 {
532 unsigned long i, mpt_size, va;
533 unsigned int n, memflags;
534 l3_pgentry_t *l3_ro_mpt;
535 l2_pgentry_t *l2_ro_mpt = NULL;
536 struct page_info *l1_pg, *l2_pg, *l3_pg;
538 /* Create user-accessible L2 directory to map the MPT for guests. */
539 if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
540 goto nomem;
541 l3_ro_mpt = page_to_virt(l3_pg);
542 clear_page(l3_ro_mpt);
543 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
544 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
546 /*
547 * Allocate and map the machine-to-phys table.
548 * This also ensures L3 is present for fixmaps.
549 */
550 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
551 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
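/* mpt_size is the byte size of the M2P (BYTES_PER_LONG per frame), rounded up
 * to a 2M boundary so it can be mapped with superpages. */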
552 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
553 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
554 sizeof(*machine_to_phys_mapping))
555 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
556 sizeof(*machine_to_phys_mapping));
557 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
558 {
559 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
560 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
561 memflags = MEMF_node(phys_to_nid(i <<
562 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
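/* If the CPU supports 1GB pages and a full 1G-aligned slice of the M2P is
 * still needed, try to back it with one order-18 allocation mapped by a
 * single L3 superpage; otherwise fall back to 2M mappings below. */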
564 if ( cpu_has_page1gb &&
565 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
566 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
567 {
568 unsigned int k, holes;
570 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
571 {
572 for ( n = 0; n < CNT; ++n)
573 if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
574 break;
575 if ( n == CNT )
576 ++holes;
577 }
578 if ( k == holes )
579 {
580 i += (1UL << PAGETABLE_ORDER) - 1;
581 continue;
582 }
583 if ( holes == 0 &&
584 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
585 memflags)) != NULL )
586 {
587 map_pages_to_xen(
588 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
589 page_to_mfn(l1_pg),
590 1UL << (2 * PAGETABLE_ORDER),
591 PAGE_HYPERVISOR);
592 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
593 0x77, 1UL << L3_PAGETABLE_SHIFT);
595 ASSERT(!l2_table_offset(va));
596 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
597 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
598 l3e_from_page(l1_pg,
599 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
600 i += (1UL << PAGETABLE_ORDER) - 1;
601 continue;
602 }
603 }
605 for ( n = 0; n < CNT; ++n)
606 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
607 break;
608 if ( n == CNT )
609 l1_pg = NULL;
610 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
611 memflags)) == NULL )
612 goto nomem;
613 else
614 {
615 map_pages_to_xen(
616 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
617 page_to_mfn(l1_pg),
618 1UL << PAGETABLE_ORDER,
619 PAGE_HYPERVISOR);
620 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
621 0x55, 1UL << L2_PAGETABLE_SHIFT);
622 }
623 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
624 {
625 if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
626 goto nomem;
627 l2_ro_mpt = page_to_virt(l2_pg);
628 clear_page(l2_ro_mpt);
629 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
630 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
631 ASSERT(!l2_table_offset(va));
632 }
633 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
634 if ( l1_pg )
635 l2e_write(l2_ro_mpt, l2e_from_page(
636 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
637 l2_ro_mpt++;
638 }
639 #undef CNT
640 #undef MFN
642 /* Create user-accessible L2 directory to map the MPT for compat guests. */
643 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
644 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
645 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
646 HIRO_COMPAT_MPT_VIRT_START)]);
647 if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
648 goto nomem;
649 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
650 clear_page(l2_ro_mpt);
651 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
652 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
653 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
654 /* Allocate and map the compatibility mode machine-to-phys table. */
655 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
656 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
657 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
658 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
659 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
660 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
661 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
662 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
663 sizeof(*compat_machine_to_phys_mapping))
664 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
665 sizeof(*compat_machine_to_phys_mapping));
666 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
667 {
668 memflags = MEMF_node(phys_to_nid(i <<
669 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
670 for ( n = 0; n < CNT; ++n)
671 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
672 break;
673 if ( n == CNT )
674 continue;
675 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
676 memflags)) == NULL )
677 goto nomem;
678 map_pages_to_xen(
679 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
680 page_to_mfn(l1_pg),
681 1UL << PAGETABLE_ORDER,
682 PAGE_HYPERVISOR);
683 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
684 (i << L2_PAGETABLE_SHIFT)),
685 0x55,
686 1UL << L2_PAGETABLE_SHIFT);
687 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
688 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
689 }
690 #undef CNT
691 #undef MFN
693 /* Set up linear page table mapping. */
694 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
695 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
696 return;
698 nomem:
699 panic("Not enough memory for m2p table\n");
700 }
702 void __init setup_idle_pagetable(void)
703 {
704 /* Install per-domain mappings for idle domain. */
705 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
706 l4e_from_page(
707 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
708 __PAGE_HYPERVISOR));
709 }
711 void __init zap_low_mappings(void)
712 {
713 BUG_ON(num_online_cpus() != 1);
715 /* Remove aliased mapping of first 1:1 PML4 entry. */
716 l4e_write(&idle_pg_table[0], l4e_empty());
717 flush_local(FLUSH_TLB_GLOBAL);
719 /* Replace with mapping of the boot trampoline only. */
720 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
721 0x10, __PAGE_HYPERVISOR);
722 }
724 int __cpuinit setup_compat_arg_xlat(unsigned int cpu, int node)
725 {
726 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
727 unsigned long sz = PAGE_SIZE << order;
728 unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
729 struct page_info *pg;
731 pg = alloc_domheap_pages(NULL, order, memflags);
732 if ( !pg )
733 return -ENOMEM;
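/* The domheap allocator only hands out power-of-two orders; free the excess
 * pages from the front of the allocation and keep just COMPAT_ARG_XLAT_SIZE
 * worth at the end for this CPU's argument-translation area. */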
735 for ( ; (sz -= PAGE_SIZE) >= COMPAT_ARG_XLAT_SIZE; ++pg )
736 free_domheap_page(pg);
738 per_cpu(compat_arg_xlat, cpu) = page_to_virt(pg);
740 return 0;
741 }
743 void __init subarch_init_memory(void)
744 {
745 unsigned long i, n, v, m2p_start_mfn;
746 l3_pgentry_t l3e;
747 l2_pgentry_t l2e;
749 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
750 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
751 /* M2P table is mappable read-only by privileged domains. */
752 for ( v = RDWR_MPT_VIRT_START;
753 v != RDWR_MPT_VIRT_END;
754 v += n << PAGE_SHIFT )
755 {
756 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
757 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
758 l3_table_offset(v)];
759 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
760 continue;
761 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
762 {
763 n = L1_PAGETABLE_ENTRIES;
764 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
765 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
766 continue;
767 m2p_start_mfn = l2e_get_pfn(l2e);
768 }
769 else
770 {
771 m2p_start_mfn = l3e_get_pfn(l3e);
772 }
774 for ( i = 0; i < n; i++ )
775 {
776 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
777 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
778 }
779 }
781 for ( v = RDWR_COMPAT_MPT_VIRT_START;
782 v != RDWR_COMPAT_MPT_VIRT_END;
783 v += 1 << L2_PAGETABLE_SHIFT )
784 {
785 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
786 l3_table_offset(v)];
787 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
788 continue;
789 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
790 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
791 continue;
792 m2p_start_mfn = l2e_get_pfn(l2e);
794 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
795 {
796 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
797 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
798 }
799 }
801 if ( setup_compat_arg_xlat(smp_processor_id(),
802 apicid_to_node[boot_cpu_physical_apicid]) )
803 panic("Could not setup argument translation area");
804 }
806 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
807 {
808 struct xen_machphys_mfn_list xmml;
809 l3_pgentry_t l3e;
810 l2_pgentry_t l2e;
811 unsigned long v;
812 xen_pfn_t mfn, last_mfn;
813 unsigned int i;
814 long rc = 0;
816 switch ( op )
817 {
818 case XENMEM_machphys_mfn_list:
819 if ( copy_from_guest(&xmml, arg, 1) )
820 return -EFAULT;
822 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
823 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
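/* Return one MFN per 2M chunk of the M2P; for chunks that are not populated
 * (PFN holes), repeat the previously returned MFN so the guest still gets a
 * dense list. */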
824 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
825 (i != xmml.max_extents) &&
826 (v < (unsigned long)(machine_to_phys_mapping + max_page));
827 i++, v += 1UL << L2_PAGETABLE_SHIFT )
828 {
829 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
830 l3_table_offset(v)];
831 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
832 mfn = last_mfn;
833 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
834 {
835 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
836 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
837 mfn = l2e_get_pfn(l2e);
838 else
839 mfn = last_mfn;
840 }
841 else
842 {
843 mfn = l3e_get_pfn(l3e)
844 + (l2_table_offset(v) << PAGETABLE_ORDER);
845 }
846 ASSERT(mfn);
847 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
848 return -EFAULT;
849 last_mfn = mfn;
850 }
852 xmml.nr_extents = i;
853 if ( copy_to_guest(arg, &xmml, 1) )
854 return -EFAULT;
856 break;
858 default:
859 rc = -ENOSYS;
860 break;
861 }
863 return rc;
864 }
866 long do_stack_switch(unsigned long ss, unsigned long esp)
867 {
868 fixup_guest_stack_selector(current->domain, ss);
869 current->arch.guest_context.kernel_ss = ss;
870 current->arch.guest_context.kernel_sp = esp;
871 return 0;
872 }
874 long do_set_segment_base(unsigned int which, unsigned long base)
875 {
876 struct vcpu *v = current;
877 long ret = 0;
879 switch ( which )
880 {
881 case SEGBASE_FS:
882 if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
883 ret = -EFAULT;
884 else
885 v->arch.guest_context.fs_base = base;
886 break;
888 case SEGBASE_GS_USER:
889 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
890 ret = -EFAULT;
891 else
892 v->arch.guest_context.gs_base_user = base;
893 break;
895 case SEGBASE_GS_KERNEL:
896 if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
897 ret = -EFAULT;
898 else
899 v->arch.guest_context.gs_base_kernel = base;
900 break;
902 case SEGBASE_GS_USER_SEL:
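/* Load the user %gs selector with the user GS base active: swap to the user
 * GS base, load the selector (the fixup forces it to zero if the load
 * faults), then swap back to Xen's GS base. */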
903 __asm__ __volatile__ (
904 " swapgs \n"
905 "1: movl %k0,%%gs \n"
906 " "safe_swapgs" \n"
907 ".section .fixup,\"ax\" \n"
908 "2: xorl %k0,%k0 \n"
909 " jmp 1b \n"
910 ".previous \n"
911 ".section __ex_table,\"a\"\n"
912 " .align 8 \n"
913 " .quad 1b,2b \n"
914 ".previous "
915 : : "r" (base&0xffff) );
916 break;
918 default:
919 ret = -EINVAL;
920 break;
921 }
923 return ret;
924 }
927 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
928 int check_descriptor(const struct domain *dom, struct desc_struct *d)
929 {
930 u32 a = d->a, b = d->b;
931 u16 cs;
932 unsigned int dpl;
934 /* A not-present descriptor will always fault, so is safe. */
935 if ( !(b & _SEGMENT_P) )
936 goto good;
938 /* Check and fix up the DPL. */
939 dpl = (b >> 13) & 3;
940 __fixup_guest_selector(dom, dpl);
941 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
943 /* All code and data segments are okay. No base/limit checking. */
944 if ( (b & _SEGMENT_S) )
945 {
946 if ( is_pv_32bit_domain(dom) )
947 {
948 unsigned long base, limit;
950 if ( b & _SEGMENT_L )
951 goto bad;
953 /*
954 * Older PAE Linux guests use segments which are limited to
955 * 0xf6800000. Extend these to allow access to the larger read-only
956 * M2P table available in 32on64 mode.
957 */
958 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
960 limit = (b & 0xf0000) | (a & 0xffff);
961 limit++; /* We add one because limit is inclusive. */
963 if ( (b & _SEGMENT_G) )
964 limit <<= 12;
966 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
967 {
968 a |= 0x0000ffff;
969 b |= 0x000f0000;
970 }
971 }
973 goto good;
974 }
976 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
977 if ( (b & _SEGMENT_TYPE) == 0x000 )
978 goto good;
980 /* Everything but a call gate is discarded here. */
981 if ( (b & _SEGMENT_TYPE) != 0xc00 )
982 goto bad;
984 /* Validate the target code selector. */
985 cs = a >> 16;
986 if ( !guest_gate_selector_okay(dom, cs) )
987 goto bad;
988 /*
989 * Force DPL to zero, causing a GP fault with its error code indicating
990 * the gate in use, allowing emulation. This is necessary because with
991 * native guests (kernel in ring 3) call gates cannot be used directly
992 * to transition from user to kernel mode (and whether a gate is used
993 * to enter the kernel can only be determined when the gate is being
994 * used), and with compat guests call gates cannot be used at all as
995 * there are only 64-bit ones.
996 * Store the original DPL in the selector's RPL field.
997 */
998 b &= ~_SEGMENT_DPL;
999 cs = (cs & ~3) | dpl;
1000 a = (a & 0xffffU) | (cs << 16);
1002 /* Reserved bits must be zero. */
1003 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1004 goto bad;
1006 good:
1007 d->a = a;
1008 d->b = b;
1009 return 1;
1010 bad:
1011 return 0;
1012 }
1014 void domain_set_alloc_bitsize(struct domain *d)
1015 {
1016 if ( !is_pv_32on64_domain(d) ||
1017 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1018 d->arch.physaddr_bitsize > 0 )
1019 return;
1020 d->arch.physaddr_bitsize =
1021 /* 2^n entries can be contained in guest's p2m mapping space */
1022 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1023 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1024 + PAGE_SHIFT;
1025 }
1027 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1028 {
1029 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1030 return bits;
1031 return min(d->arch.physaddr_bitsize, bits);
1032 }
1034 #include "compat/mm.c"
1036 /*
1037 * Local variables:
1038 * mode: C
1039 * c-set-style: "BSD"
1040 * c-basic-offset: 4
1041 * tab-width: 4
1042 * indent-tabs-mode: nil
1043 * End:
1044 */