
view xen/arch/x86/x86_64/mm.c @ 20665:7d7e221370ea

memory hotadd 6/7: Allocate L3 table for whole direct mapping range if
memory hotplug is supported.

Hot-added memory may need a new L4 entry for the 1:1 mapping. This patch
sets up all L4 entries for the 1:1 mapping when memory hotadd is supported,
so that we don't need to sync the guest page tables in the page fault handler.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Dec 11 08:57:30 2009 +0000 (2009-12-11)
parents 611f49efe955
children a50c1cbf08ec
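For orientation, here is a condensed, editorial sketch of the approach this
patch takes. It mirrors the mem_hotplug block added to paging_init() further
down in the listing; every name used (mem_hotplug, DIRECTMAP_VIRT_START,
l4e_write(), ...) is taken from that code, nothing new is introduced.

    /* Editorial sketch, condensed from the paging_init() hunk below. */
    if ( mem_hotplug )
    {
        unsigned long va;
        struct page_info *l3_pg;

        /* One L4 slot covers 1UL << L4_PAGETABLE_SHIFT bytes (512GB). */
        for ( va = DIRECTMAP_VIRT_START; va < DIRECTMAP_VIRT_END;
              va += 1UL << L4_PAGETABLE_SHIFT )
        {
            if ( l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
                 _PAGE_PRESENT )
                continue;
            /* Plug an empty L3 into the slot so hot-add never edits the L4. */
            l3_pg = alloc_domheap_page(NULL, 0);   /* failure -> nomem below */
            clear_page(page_to_virt(l3_pg));
            l4e_write(&idle_pg_table[l4_table_offset(va)],
                      l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
        }
    }

Pre-populating every direct-map L4 slot costs at most a few L3 pages, but a
later hot-add then only edits L3/L2 entries, which are shared through the
pre-built L4 entries rather than copied per guest, so the page fault handler
never has to resynchronise guest page tables.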
line source
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/guest_access.h>
27 #include <asm/current.h>
28 #include <asm/asm_defns.h>
29 #include <asm/page.h>
30 #include <asm/flushtlb.h>
31 #include <asm/fixmap.h>
32 #include <asm/hypercall.h>
33 #include <asm/msr.h>
34 #include <asm/setup.h>
35 #include <public/memory.h>
37 /* Parameters for PFN/MADDR compression. */
38 unsigned long __read_mostly max_pdx;
39 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
40 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
41 unsigned long __read_mostly pfn_top_mask = 0;
42 unsigned long __read_mostly ma_top_mask = 0;
43 unsigned long __read_mostly pfn_hole_mask = 0;
44 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
46 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
48 DEFINE_PER_CPU_READ_MOSTLY(void *, compat_arg_xlat);
50 /* Top-level master (and idle-domain) page directory. */
51 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
52 idle_pg_table[L4_PAGETABLE_ENTRIES];
54 /* Enough page directories to map bottom 4GB of the memory map. */
55 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
56 l3_identmap[L3_PAGETABLE_ENTRIES];
57 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
58 l2_identmap[4*L2_PAGETABLE_ENTRIES];
60 /* Enough page directories to map the Xen text and static data. */
61 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
62 l3_xenmap[L3_PAGETABLE_ENTRIES];
63 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
64 l2_xenmap[L2_PAGETABLE_ENTRIES];
66 int __mfn_valid(unsigned long mfn)
67 {
68 return likely(mfn < max_page) &&
69 likely(!(mfn & pfn_hole_mask)) &&
70 likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT,
71 pdx_group_valid));
72 }
74 void *alloc_xen_pagetable(void)
75 {
76 unsigned long mfn;
78 if ( !early_boot )
79 {
80 struct page_info *pg = alloc_domheap_page(NULL, 0);
81 BUG_ON(pg == NULL);
82 return page_to_virt(pg);
83 }
85 mfn = alloc_boot_pages(1, 1);
86 return mfn_to_virt(mfn);
87 }
89 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
90 {
91 l4_pgentry_t *pl4e;
93 pl4e = &idle_pg_table[l4_table_offset(v)];
94 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
95 {
96 l3_pgentry_t *pl3e = alloc_xen_pagetable();
97 clear_page(pl3e);
98 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
99 }
101 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
102 }
104 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
105 {
106 l3_pgentry_t *pl3e;
108 pl3e = virt_to_xen_l3e(v);
109 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
110 {
111 l2_pgentry_t *pl2e = alloc_xen_pagetable();
112 clear_page(pl2e);
113 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
114 }
116 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
117 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
118 }
120 void *do_page_walk(struct vcpu *v, unsigned long addr)
121 {
122 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
123 l4_pgentry_t l4e, *l4t;
124 l3_pgentry_t l3e, *l3t;
125 l2_pgentry_t l2e, *l2t;
126 l1_pgentry_t l1e, *l1t;
128 if ( is_hvm_vcpu(v) )
129 return NULL;
131 l4t = mfn_to_virt(mfn);
132 l4e = l4t[l4_table_offset(addr)];
133 mfn = l4e_get_pfn(l4e);
134 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
135 return NULL;
137 l3t = mfn_to_virt(mfn);
138 l3e = l3t[l3_table_offset(addr)];
139 mfn = l3e_get_pfn(l3e);
140 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
141 return NULL;
142 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
143 return mfn_to_virt(mfn) + (addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
145 l2t = mfn_to_virt(mfn);
146 l2e = l2t[l2_table_offset(addr)];
147 mfn = l2e_get_pfn(l2e);
148 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
149 return NULL;
150 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
151 return mfn_to_virt(mfn) + (addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
153 l1t = mfn_to_virt(mfn);
154 l1e = l1t[l1_table_offset(addr)];
155 mfn = l1e_get_pfn(l1e);
156 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
157 return NULL;
159 return mfn_to_virt(mfn) + (addr & ~PAGE_MASK);
160 }
162 void __init pfn_pdx_hole_setup(unsigned long mask)
163 {
164 unsigned int i, j, bottom_shift, hole_shift;
166 for ( hole_shift = bottom_shift = j = 0; ; )
167 {
168 i = find_next_zero_bit(&mask, BITS_PER_LONG, j);
169 j = find_next_bit(&mask, BITS_PER_LONG, i);
170 if ( j >= BITS_PER_LONG )
171 break;
172 if ( j - i > hole_shift )
173 {
174 hole_shift = j - i;
175 bottom_shift = i;
176 }
177 }
178 if ( !hole_shift )
179 return;
181 printk(KERN_INFO "PFN compression on bits %u...%u\n",
182 bottom_shift, bottom_shift + hole_shift - 1);
184 pfn_pdx_hole_shift = hole_shift;
185 pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1;
186 ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1;
187 pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift;
188 pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask);
189 ma_top_mask = pfn_top_mask << PAGE_SHIFT;
190 }
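/*
 * Illustrative note, not part of the original file: how the masks set up by
 * pfn_pdx_hole_setup() above are meant to be used.  Suppose the widest run of
 * always-zero PFN bits is bits 20..23, i.e. bottom_shift = 20, hole_shift = 4.
 * Then:
 *
 *     pfn_pdx_bottom_mask = 0x00000000000fffff
 *     pfn_hole_mask       = 0x0000000000f00000
 *     pfn_top_mask        = ~0x0000000000ffffff
 *     pfn_pdx_hole_shift  = 4
 *     ma_va_bottom_mask   = 0x00000000ffffffff   (the same, in byte units)
 *
 * A pdx is the pfn with the hole squeezed out, e.g. PFN 0x12045678 becomes
 * PDX 0x01245678 (valid MFNs have zero hole bits, see __mfn_valid() above).
 * The real conversion helpers live in the headers; a hypothetical equivalent:
 */
static inline unsigned long example_pfn_to_pdx(unsigned long pfn)
{
    return (pfn & pfn_pdx_bottom_mask) |
           ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
}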
192 /*
193 * Allocate page table pages for m2p table
194 */
195 struct mem_hotadd_info
196 {
197 unsigned long spfn;
198 unsigned long epfn;
199 unsigned long cur;
200 };
202 int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
203 {
204 return (pfn < info->epfn && pfn >= info->spfn);
205 }
207 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
208 {
209 unsigned mfn;
211 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
212 info->cur >= info->spfn);
214 mfn = info->cur;
215 info->cur += (1UL << PAGETABLE_ORDER);
216 return mfn;
217 }
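/*
 * Illustrative note, not part of the original file: the hot-added range
 * [spfn, epfn) is also the pool that backs its own frame-table and M2P
 * pages.  'cur' is a simple bump allocator handing out 2MB chunks
 * (1UL << PAGETABLE_ORDER pages), consumed by setup_frametable_chunk() and
 * setup_m2p_table() below.  A hypothetical caller, assuming the range is
 * comfortably larger than two chunks:
 */
static void example_hotadd_cursor(unsigned long spfn, unsigned long epfn)
{
    struct mem_hotadd_info info = { .spfn = spfn, .epfn = epfn, .cur = spfn };
    unsigned long first, second;

    first  = alloc_hotadd_mfn(&info);   /* == spfn       */
    second = alloc_hotadd_mfn(&info);   /* == spfn + 512 */
    /* Each chunk is then mapped with map_pages_to_xen(..., first,
     * 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR) and filled in. */
}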
219 #define M2P_NO_MAPPED 0
220 #define M2P_2M_MAPPED 1
221 #define M2P_1G_MAPPED 2
222 static int m2p_mapped(unsigned long spfn)
223 {
224 unsigned long va;
225 l3_pgentry_t *l3_ro_mpt;
226 l2_pgentry_t *l2_ro_mpt;
228 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
229 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
231 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
232 (_PAGE_PRESENT |_PAGE_PSE))
233 {
234 case _PAGE_PSE|_PAGE_PRESENT:
235 return M2P_1G_MAPPED;
236 break;
237 /* Check for next level */
238 case _PAGE_PRESENT:
239 break;
240 default:
241 return M2P_NO_MAPPED;
242 break;
243 }
244 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
246 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
247 return M2P_2M_MAPPED;
249 return M2P_NO_MAPPED;
250 }
252 int share_hotadd_m2p_table(struct mem_hotadd_info *info)
253 {
254 unsigned long i, n, v, m2p_start_mfn = 0;
255 l3_pgentry_t l3e;
256 l2_pgentry_t l2e;
258 /* M2P table is mappable read-only by privileged domains. */
259 for ( v = RDWR_MPT_VIRT_START;
260 v != RDWR_MPT_VIRT_END;
261 v += n << PAGE_SHIFT )
262 {
263 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
264 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
265 l3_table_offset(v)];
266 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
267 continue;
268 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
269 {
270 n = L1_PAGETABLE_ENTRIES;
271 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
272 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
273 continue;
274 m2p_start_mfn = l2e_get_pfn(l2e);
275 }
276 else
277 continue;
279 for ( i = 0; i < n; i++ )
280 {
281 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
282 if (hotadd_mem_valid(m2p_start_mfn + i, info))
283 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
284 }
285 }
287 for ( v = RDWR_COMPAT_MPT_VIRT_START;
288 v != RDWR_COMPAT_MPT_VIRT_END;
289 v += 1 << L2_PAGETABLE_SHIFT )
290 {
291 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
292 l3_table_offset(v)];
293 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
294 continue;
295 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
296 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
297 continue;
298 m2p_start_mfn = l2e_get_pfn(l2e);
300 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
301 {
302 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
303 if (hotadd_mem_valid(m2p_start_mfn + i, info))
304 {
305 printk("now share page %lx\n", m2p_start_mfn + i);
306 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
307 }
308 }
309 }
310 return 0;
311 }
313 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
314 {
315 unsigned long i, va, rwva, pt_pfn;
316 unsigned long smap = info->spfn, emap = info->epfn;
318 l3_pgentry_t *l3_ro_mpt;
319 l2_pgentry_t *l2_ro_mpt;
321 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
322 return;
324 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
325 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
327 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
329 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
331 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
333 for ( i = smap; i < emap; )
334 {
335 va = HIRO_COMPAT_MPT_VIRT_START +
336 i * sizeof(*compat_machine_to_phys_mapping);
337 rwva = RDWR_COMPAT_MPT_VIRT_START +
338 i * sizeof(*compat_machine_to_phys_mapping);
339 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
340 {
341 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
342 if ( hotadd_mem_valid(pt_pfn, info) )
343 {
344 destroy_xen_mappings(rwva, rwva +
345 (1UL << L2_PAGETABLE_SHIFT));
346 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
347 }
348 }
350 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
351 }
353 return;
354 }
356 void destroy_m2p_mapping(struct mem_hotadd_info *info)
357 {
358 l3_pgentry_t *l3_ro_mpt;
359 unsigned long i, va, rwva;
360 unsigned long smap = info->spfn, emap = info->epfn;
362 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
364 /*
365 * No need to clean up m2p structures that existed before the hotplug
366 */
367 for (i = smap; i < emap;)
368 {
369 unsigned long pt_pfn;
370 l2_pgentry_t *l2_ro_mpt;
372 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
373 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
375 /* 1G mapping should not be created by mem hotadd */
376 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
377 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
378 {
379 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
380 (1UL << (L3_PAGETABLE_SHIFT - 3) );
381 continue;
382 }
384 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
385 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
386 {
387 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
388 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
389 continue;
390 }
392 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
393 if ( hotadd_mem_valid(pt_pfn, info) )
394 {
395 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
397 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
398 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
399 }
400 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
401 (1UL << (L2_PAGETABLE_SHIFT - 3));
402 }
404 destroy_compat_m2p_mapping(info);
406 /* Brute-force: flush all TLBs */
407 flush_tlb_all();
408 return;
409 }
411 /*
412 * Allocate and map the compatibility mode machine-to-phys table.
413 * spfn/epfn: the pfn ranges to be setup
414 * free_s/free_e: the pfn ranges that are still free
415 */
416 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
417 {
418 unsigned long i, va, smap, emap, rwva, epfn = info->epfn;
419 unsigned int n, memflags;
420 l3_pgentry_t *l3_ro_mpt = NULL;
421 l2_pgentry_t *l2_ro_mpt = NULL;
422 struct page_info *l1_pg;
424 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
426 /*
427 * Notice: For hot-added memory, only the range below m2p_compat_vstart
428 * will be filled (assuming memory is discontiguous when booting).
429 */
430 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
431 return 0;
433 if (epfn > (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START))
434 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
436 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
437 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
439 va = HIRO_COMPAT_MPT_VIRT_START +
440 smap * sizeof(*compat_machine_to_phys_mapping);
441 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
443 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
445 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
447 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
448 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
449 sizeof(*compat_machine_to_phys_mapping))
450 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
451 sizeof(*compat_machine_to_phys_mapping));
453 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
454 {
455 va = HIRO_COMPAT_MPT_VIRT_START +
456 i * sizeof(*compat_machine_to_phys_mapping);
458 rwva = RDWR_COMPAT_MPT_VIRT_START +
459 i * sizeof(*compat_machine_to_phys_mapping);
461 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
462 continue;
464 for ( n = 0; n < CNT; ++n)
465 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
466 break;
467 if ( n == CNT )
468 continue;
470 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
472 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
473 map_pages_to_xen(rwva,
474 page_to_mfn(l1_pg),
475 1UL << PAGETABLE_ORDER,
476 PAGE_HYPERVISOR);
477 memset((void *)rwva, 0x55, 1UL << L2_PAGETABLE_SHIFT);
478 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
479 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
480 }
481 #undef CNT
482 #undef MFN
483 return 0;
484 }
486 /*
487 * Allocate and map the machine-to-phys table.
488 * The L3 for the RO/RW MPT and the L2 for the compat MPT should be set up already
489 */
490 int setup_m2p_table(struct mem_hotadd_info *info)
491 {
492 unsigned long i, va, smap, emap;
493 unsigned int n, memflags;
494 l2_pgentry_t *l2_ro_mpt = NULL;
495 l3_pgentry_t *l3_ro_mpt = NULL;
496 struct page_info *l1_pg, *l2_pg;
497 int ret = 0;
499 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
500 & _PAGE_PRESENT);
501 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
503 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
504 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
505 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
507 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
509 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
510 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
511 sizeof(*machine_to_phys_mapping))
513 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
514 sizeof(*machine_to_phys_mapping));
516 i = smap;
517 while ( i < emap )
518 {
519 switch ( m2p_mapped(i) )
520 {
521 case M2P_1G_MAPPED:
522 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
523 (1UL << (L3_PAGETABLE_SHIFT - 3));
524 continue;
525 case M2P_2M_MAPPED:
526 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
527 (1UL << (L2_PAGETABLE_SHIFT - 3));
528 continue;
529 default:
530 break;
531 }
533 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
534 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
536 for ( n = 0; n < CNT; ++n)
537 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
538 break;
539 if ( n == CNT )
540 l1_pg = NULL;
541 else
542 {
543 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
544 map_pages_to_xen(
545 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
546 page_to_mfn(l1_pg),
547 1UL << PAGETABLE_ORDER,
548 PAGE_HYPERVISOR);
549 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
550 0x55, 1UL << L2_PAGETABLE_SHIFT);
552 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
553 _PAGE_PSE));
554 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
555 _PAGE_PRESENT )
556 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
557 l2_table_offset(va);
558 else
559 {
560 l2_pg = alloc_domheap_page(NULL, memflags);
562 if (!l2_pg)
563 {
564 ret = -ENOMEM;
565 goto error;
566 }
568 l2_ro_mpt = page_to_virt(l2_pg);
569 clear_page(l2_ro_mpt);
570 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
571 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
572 l2_ro_mpt += l2_table_offset(va);
573 }
575 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
576 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg,
577 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
578 }
579 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
580 l2_ro_mpt = NULL;
581 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
582 }
583 #undef CNT
584 #undef MFN
586 ret = setup_compat_m2p_table(info);
587 error:
588 return ret;
589 }
591 void __init paging_init(void)
592 {
593 unsigned long i, mpt_size, va;
594 unsigned int n, memflags;
595 l3_pgentry_t *l3_ro_mpt;
596 l2_pgentry_t *l2_ro_mpt = NULL;
597 struct page_info *l1_pg, *l2_pg, *l3_pg;
599 /*
600 * We set up the L3s for the 1:1 mapping if the host supports memory
601 * hotplug, to avoid having to sync the 1:1 mapping in the page fault handler
602 */
603 if ( mem_hotplug )
604 {
605 unsigned long va;
607 for ( va = DIRECTMAP_VIRT_START;
608 va < DIRECTMAP_VIRT_END;
609 va += (1UL << L4_PAGETABLE_SHIFT) )
610 {
611 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
612 _PAGE_PRESENT) )
613 {
614 l3_pg = alloc_domheap_page(NULL, 0);
615 if ( !l3_pg )
616 goto nomem;
617 l3_ro_mpt = page_to_virt(l3_pg);
618 clear_page(l3_ro_mpt);
619 l4e_write(&idle_pg_table[l4_table_offset(va)],
620 l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
621 }
622 }
623 }
625 /* Create user-accessible L2 directory to map the MPT for guests. */
626 if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
627 goto nomem;
628 l3_ro_mpt = page_to_virt(l3_pg);
629 clear_page(l3_ro_mpt);
630 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
631 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
633 /*
634 * Allocate and map the machine-to-phys table.
635 * This also ensures L3 is present for fixmaps.
636 */
637 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
638 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
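/*
 * Illustrative note, not part of the original file: with, say,
 * max_page = 0x1000000 (16M frames, i.e. 64GB of RAM) and BYTES_PER_LONG = 8,
 * mpt_size = 0x1000000 * 8 = 128MB, already a multiple of 2MB, so the loop
 * below runs mpt_size >> L2_PAGETABLE_SHIFT = 64 times, handling one 2MB
 * superpage of the M2P per iteration.
 */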
639 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
640 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
641 sizeof(*machine_to_phys_mapping))
642 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
643 sizeof(*machine_to_phys_mapping));
644 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
645 {
646 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
647 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
648 memflags = MEMF_node(phys_to_nid(i <<
649 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
651 if ( cpu_has_page1gb &&
652 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
653 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
654 {
655 unsigned int k, holes;
657 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
658 {
659 for ( n = 0; n < CNT; ++n)
660 if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
661 break;
662 if ( n == CNT )
663 ++holes;
664 }
665 if ( k == holes )
666 {
667 i += (1UL << PAGETABLE_ORDER) - 1;
668 continue;
669 }
670 if ( holes == 0 &&
671 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
672 memflags)) != NULL )
673 {
674 map_pages_to_xen(
675 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
676 page_to_mfn(l1_pg),
677 1UL << (2 * PAGETABLE_ORDER),
678 PAGE_HYPERVISOR);
679 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
680 0x77, 1UL << L3_PAGETABLE_SHIFT);
682 ASSERT(!l2_table_offset(va));
683 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
684 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
685 l3e_from_page(l1_pg,
686 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
687 i += (1UL << PAGETABLE_ORDER) - 1;
688 continue;
689 }
690 }
692 for ( n = 0; n < CNT; ++n)
693 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
694 break;
695 if ( n == CNT )
696 l1_pg = NULL;
697 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
698 memflags)) == NULL )
699 goto nomem;
700 else
701 {
702 map_pages_to_xen(
703 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
704 page_to_mfn(l1_pg),
705 1UL << PAGETABLE_ORDER,
706 PAGE_HYPERVISOR);
707 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
708 0x55, 1UL << L2_PAGETABLE_SHIFT);
709 }
710 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
711 {
712 if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
713 goto nomem;
714 l2_ro_mpt = page_to_virt(l2_pg);
715 clear_page(l2_ro_mpt);
716 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
717 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
718 ASSERT(!l2_table_offset(va));
719 }
720 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
721 if ( l1_pg )
722 l2e_write(l2_ro_mpt, l2e_from_page(
723 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
724 l2_ro_mpt++;
725 }
726 #undef CNT
727 #undef MFN
729 /* Create user-accessible L2 directory to map the MPT for compat guests. */
730 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
731 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
732 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
733 HIRO_COMPAT_MPT_VIRT_START)]);
734 if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
735 goto nomem;
736 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
737 clear_page(l2_ro_mpt);
738 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
739 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
740 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
741 /* Allocate and map the compatibility mode machine-to-phys table. */
742 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
743 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
744 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
745 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
746 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
747 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
748 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
749 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
750 sizeof(*compat_machine_to_phys_mapping))
751 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
752 sizeof(*compat_machine_to_phys_mapping));
753 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
754 {
755 memflags = MEMF_node(phys_to_nid(i <<
756 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
757 for ( n = 0; n < CNT; ++n)
758 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
759 break;
760 if ( n == CNT )
761 continue;
762 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
763 memflags)) == NULL )
764 goto nomem;
765 map_pages_to_xen(
766 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
767 page_to_mfn(l1_pg),
768 1UL << PAGETABLE_ORDER,
769 PAGE_HYPERVISOR);
770 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
771 (i << L2_PAGETABLE_SHIFT)),
772 0x55,
773 1UL << L2_PAGETABLE_SHIFT);
774 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
775 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
776 }
777 #undef CNT
778 #undef MFN
780 /* Set up linear page table mapping. */
781 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
782 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
783 return;
785 nomem:
786 panic("Not enough memory for m2p table\n");
787 }
789 void __init setup_idle_pagetable(void)
790 {
791 /* Install per-domain mappings for idle domain. */
792 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
793 l4e_from_page(
794 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
795 __PAGE_HYPERVISOR));
796 }
798 void __init zap_low_mappings(void)
799 {
800 BUG_ON(num_online_cpus() != 1);
802 /* Remove aliased mapping of first 1:1 PML4 entry. */
803 l4e_write(&idle_pg_table[0], l4e_empty());
804 flush_local(FLUSH_TLB_GLOBAL);
806 /* Replace with mapping of the boot trampoline only. */
807 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
808 0x10, __PAGE_HYPERVISOR);
809 }
811 int __cpuinit setup_compat_arg_xlat(unsigned int cpu, int node)
812 {
813 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
814 unsigned long sz = PAGE_SIZE << order;
815 unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
816 struct page_info *pg;
818 pg = alloc_domheap_pages(NULL, order, memflags);
819 if ( !pg )
820 return -ENOMEM;
822 for ( ; (sz -= PAGE_SIZE) >= COMPAT_ARG_XLAT_SIZE; ++pg )
823 free_domheap_page(pg);
825 per_cpu(compat_arg_xlat, cpu) = page_to_virt(pg);
827 return 0;
828 }
830 void cleanup_frame_table(struct mem_hotadd_info *info)
831 {
832 unsigned long sva, eva;
833 l3_pgentry_t l3e;
834 l2_pgentry_t l2e;
835 unsigned long spfn, epfn;
837 spfn = info->spfn;
838 epfn = info->epfn;
840 sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
841 eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));
843 /* Initialize all pages */
844 memset(mfn_to_page(spfn), -1, mfn_to_page(epfn) - mfn_to_page(spfn));
846 while (sva < eva)
847 {
848 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
849 l3_table_offset(sva)];
850 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
851 (l3e_get_flags(l3e) & _PAGE_PSE) )
852 {
853 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
854 (1UL << L3_PAGETABLE_SHIFT);
855 continue;
856 }
858 l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
859 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
861 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
862 (_PAGE_PSE | _PAGE_PRESENT) )
863 {
864 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
865 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
866 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
867 (1UL << L2_PAGETABLE_SHIFT) - 1));
869 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
870 (1UL << L2_PAGETABLE_SHIFT);
871 continue;
872 }
874 ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
875 _PAGE_PRESENT);
876 sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
877 (1UL << PAGE_SHIFT);
878 }
880 /* Brute-force: flush all TLBs */
881 flush_tlb_all();
882 }
884 /* Should we be paranoid about failure in map_pages_to_xen? */
885 static int setup_frametable_chunk(void *start, void *end,
886 struct mem_hotadd_info *info)
887 {
888 unsigned long s = (unsigned long)start;
889 unsigned long e = (unsigned long)end;
890 unsigned long mfn;
892 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
893 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
895 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
896 {
897 mfn = alloc_hotadd_mfn(info);
898 map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR);
899 }
900 memset(start, -1, s - (unsigned long)start);
902 return 0;
903 }
905 int extend_frame_table(struct mem_hotadd_info *info)
906 {
907 unsigned long cidx, nidx, eidx, spfn, epfn;
909 spfn = info->spfn;
910 epfn = info->epfn;
912 eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
913 nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;
915 ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
916 (pfn_to_pdx(epfn) <= FRAMETABLE_SIZE / sizeof(struct page_info)) );
918 if ( test_bit(cidx, pdx_group_valid) )
919 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
921 if ( cidx >= eidx )
922 return 0;
924 while ( cidx < eidx )
925 {
926 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
927 if ( nidx >= eidx )
928 nidx = eidx;
929 setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
930 pdx_to_page(nidx * PDX_GROUP_COUNT),
931 info);
933 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
934 }
936 memset(mfn_to_page(spfn), 0, mfn_to_page(epfn) - mfn_to_page(spfn));
937 return 0;
938 }
940 void __init subarch_init_memory(void)
941 {
942 unsigned long i, n, v, m2p_start_mfn;
943 l3_pgentry_t l3e;
944 l2_pgentry_t l2e;
946 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
947 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
948 /* M2P table is mappable read-only by privileged domains. */
949 for ( v = RDWR_MPT_VIRT_START;
950 v != RDWR_MPT_VIRT_END;
951 v += n << PAGE_SHIFT )
952 {
953 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
954 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
955 l3_table_offset(v)];
956 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
957 continue;
958 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
959 {
960 n = L1_PAGETABLE_ENTRIES;
961 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
962 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
963 continue;
964 m2p_start_mfn = l2e_get_pfn(l2e);
965 }
966 else
967 {
968 m2p_start_mfn = l3e_get_pfn(l3e);
969 }
971 for ( i = 0; i < n; i++ )
972 {
973 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
974 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
975 }
976 }
978 for ( v = RDWR_COMPAT_MPT_VIRT_START;
979 v != RDWR_COMPAT_MPT_VIRT_END;
980 v += 1 << L2_PAGETABLE_SHIFT )
981 {
982 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
983 l3_table_offset(v)];
984 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
985 continue;
986 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
987 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
988 continue;
989 m2p_start_mfn = l2e_get_pfn(l2e);
991 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
992 {
993 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
994 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
995 }
996 }
998 if ( setup_compat_arg_xlat(smp_processor_id(),
999 apicid_to_node[boot_cpu_physical_apicid]) )
1000 panic("Could not setup argument translation area");
1001 }
1003 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1004 {
1005 struct xen_machphys_mfn_list xmml;
1006 l3_pgentry_t l3e;
1007 l2_pgentry_t l2e;
1008 unsigned long v;
1009 xen_pfn_t mfn, last_mfn;
1010 unsigned int i;
1011 long rc = 0;
1013 switch ( op )
1014 {
1015 case XENMEM_machphys_mfn_list:
1016 if ( copy_from_guest(&xmml, arg, 1) )
1017 return -EFAULT;
1019 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1020 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1021 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
1022 (i != xmml.max_extents) &&
1023 (v < (unsigned long)(machine_to_phys_mapping + max_page));
1024 i++, v += 1UL << L2_PAGETABLE_SHIFT )
1025 {
1026 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
1027 l3_table_offset(v)];
1028 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1029 mfn = last_mfn;
1030 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
1031 {
1032 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
1033 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1034 mfn = l2e_get_pfn(l2e);
1035 else
1036 mfn = last_mfn;
1037 }
1038 else
1039 {
1040 mfn = l3e_get_pfn(l3e)
1041 + (l2_table_offset(v) << PAGETABLE_ORDER);
1042 }
1043 ASSERT(mfn);
1044 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
1045 return -EFAULT;
1046 last_mfn = mfn;
1047 }
1049 xmml.nr_extents = i;
1050 if ( copy_to_guest(arg, &xmml, 1) )
1051 return -EFAULT;
1053 break;
1055 default:
1056 rc = -ENOSYS;
1057 break;
1058 }
1060 return rc;
1061 }
1063 long do_stack_switch(unsigned long ss, unsigned long esp)
1064 {
1065 fixup_guest_stack_selector(current->domain, ss);
1066 current->arch.guest_context.kernel_ss = ss;
1067 current->arch.guest_context.kernel_sp = esp;
1068 return 0;
1069 }
1071 long do_set_segment_base(unsigned int which, unsigned long base)
1072 {
1073 struct vcpu *v = current;
1074 long ret = 0;
1076 switch ( which )
1077 {
1078 case SEGBASE_FS:
1079 if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
1080 ret = -EFAULT;
1081 else
1082 v->arch.guest_context.fs_base = base;
1083 break;
1085 case SEGBASE_GS_USER:
1086 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
1087 ret = -EFAULT;
1088 else
1089 v->arch.guest_context.gs_base_user = base;
1090 break;
1092 case SEGBASE_GS_KERNEL:
1093 if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
1094 ret = -EFAULT;
1095 else
1096 v->arch.guest_context.gs_base_kernel = base;
1097 break;
1099 case SEGBASE_GS_USER_SEL:
1100 __asm__ __volatile__ (
1101 " swapgs \n"
1102 "1: movl %k0,%%gs \n"
1103 " "safe_swapgs" \n"
1104 ".section .fixup,\"ax\" \n"
1105 "2: xorl %k0,%k0 \n"
1106 " jmp 1b \n"
1107 ".previous \n"
1108 ".section __ex_table,\"a\"\n"
1109 " .align 8 \n"
1110 " .quad 1b,2b \n"
1111 ".previous "
1112 : : "r" (base&0xffff) );
1113 break;
1115 default:
1116 ret = -EINVAL;
1117 break;
1118 }
1120 return ret;
1121 }
1124 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1125 int check_descriptor(const struct domain *dom, struct desc_struct *d)
1126 {
1127 u32 a = d->a, b = d->b;
1128 u16 cs;
1129 unsigned int dpl;
1131 /* A not-present descriptor will always fault, so is safe. */
1132 if ( !(b & _SEGMENT_P) )
1133 goto good;
1135 /* Check and fix up the DPL. */
1136 dpl = (b >> 13) & 3;
1137 __fixup_guest_selector(dom, dpl);
1138 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1140 /* All code and data segments are okay. No base/limit checking. */
1141 if ( (b & _SEGMENT_S) )
1142 {
1143 if ( is_pv_32bit_domain(dom) )
1144 {
1145 unsigned long base, limit;
1147 if ( b & _SEGMENT_L )
1148 goto bad;
1150 /*
1151 * Older PAE Linux guests use segments which are limited to
1152 * 0xf6800000. Extend these to allow access to the larger read-only
1153 * M2P table available in 32on64 mode.
1154 */
1155 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
1157 limit = (b & 0xf0000) | (a & 0xffff);
1158 limit++; /* We add one because limit is inclusive. */
1160 if ( (b & _SEGMENT_G) )
1161 limit <<= 12;
1163 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1164 {
1165 a |= 0x0000ffff;
1166 b |= 0x000f0000;
1167 }
1168 }
1170 goto good;
1171 }
1173 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1174 if ( (b & _SEGMENT_TYPE) == 0x000 )
1175 goto good;
1177 /* Everything but a call gate is discarded here. */
1178 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1179 goto bad;
1181 /* Validate the target code selector. */
1182 cs = a >> 16;
1183 if ( !guest_gate_selector_okay(dom, cs) )
1184 goto bad;
1185 /*
1186 * Force DPL to zero, causing a GP fault with its error code indicating
1187 * the gate in use, allowing emulation. This is necessary because with
1188 * native guests (kernel in ring 3) call gates cannot be used directly
1189 * to transition from user to kernel mode (and whether a gate is used
1190 * to enter the kernel can only be determined when the gate is being
1191 * used), and with compat guests call gates cannot be used at all as
1192 * there are only 64-bit ones.
1193 * Store the original DPL in the selector's RPL field.
1194 */
1195 b &= ~_SEGMENT_DPL;
1196 cs = (cs & ~3) | dpl;
1197 a = (a & 0xffffU) | (cs << 16);
1199 /* Reserved bits must be zero. */
1200 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1201 goto bad;
1203 good:
1204 d->a = a;
1205 d->b = b;
1206 return 1;
1207 bad:
1208 return 0;
1209 }
1211 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1212 {
1213 struct domain *d = current->domain;
1215 if (guest_mode(regs) &&
1216 is_pv_32bit_domain(d) &&
1217 ((addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1218 (addr < MACH2PHYS_COMPAT_VIRT_END)) )
1219 return 1;
1220 return 0;
1221 }
1223 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1224 {
1225 struct domain *d = current->domain;
1226 l4_pgentry_t *pl4e = NULL;
1227 l4_pgentry_t l4e;
1228 l3_pgentry_t *pl3e = NULL;
1229 l3_pgentry_t l3e;
1230 l2_pgentry_t *pl2e = NULL;
1231 l2_pgentry_t l2e, idle_l2e;
1232 unsigned long mfn, idle_index;
1233 int ret = 0;
1235 if (!is_pv_32on64_domain(d))
1236 return 0;
1238 if ((addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1239 (addr > MACH2PHYS_COMPAT_VIRT_END) )
1240 return 0;
1242 mfn = (read_cr3()) >> PAGE_SHIFT;
1244 pl4e = map_domain_page(mfn);
1246 l4e = pl4e[addr];
1248 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1249 goto unmap;
1251 mfn = l4e_get_pfn(l4e);
1252 /* We don't need get page type here since it is current CR3 */
1253 pl3e = map_domain_page(mfn);
1255 l3e = pl3e[3];
1257 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1258 goto unmap;
1260 mfn = l3e_get_pfn(l3e);
1261 pl2e = map_domain_page(mfn);
1263 l2e = pl2e[l2_table_offset(addr)];
1265 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1266 goto unmap;
1268 idle_index = (l2_table_offset(addr) -
1269 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1270 sizeof(l2_pgentry_t);
1271 idle_l2e = compat_idle_pg_table_l2[idle_index];
1272 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1273 goto unmap;
1275 memcpy(&pl2e[l2_table_offset(addr)],
1276 &compat_idle_pg_table_l2[idle_index],
1277 sizeof(l2_pgentry_t));
1279 ret = EXCRET_fault_fixed;
1281 unmap:
1282 if ( pl4e )
1283 unmap_domain_page(pl4e);
1284 if ( pl3e )
1285 unmap_domain_page(pl3e);
1286 if ( pl2e )
1287 unmap_domain_page(pl2e);
1289 return ret;
1290 }
1292 void domain_set_alloc_bitsize(struct domain *d)
1293 {
1294 if ( !is_pv_32on64_domain(d) ||
1295 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1296 d->arch.physaddr_bitsize > 0 )
1297 return;
1298 d->arch.physaddr_bitsize =
1299 /* 2^n entries can be contained in guest's p2m mapping space */
1300 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1301 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1302 + PAGE_SHIFT;
1303 }
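/*
 * Illustrative note, not part of the original file: if the compat M2P window
 * of a 32on64 guest holds, say, 2^26 entries, then
 * fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1 = 26 and physaddr_bitsize becomes
 * 26 + PAGE_SHIFT = 38, so domain_clamp_alloc_bitsize() below keeps this
 * guest's allocations under 2^38 bytes (256GB) and every MFN it owns stays
 * representable in its p2m/m2p mapping space.
 */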
1305 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1306 {
1307 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1308 return bits;
1309 return min(d->arch.physaddr_bitsize, bits);
1310 }
1312 #include "compat/mm.c"
1314 /*
1315 * Local variables:
1316 * mode: C
1317 * c-set-style: "BSD"
1318 * c-basic-offset: 4
1319 * tab-width: 4
1320 * indent-tabs-mode: nil
1321 * End:
1322 */