debuggers.hg

view xen/arch/x86/x86_64/mm.c @ 22906:700ac6445812

Now add KDB to the non-kdb tree
author Mukesh Rathor
date Thu Feb 03 15:42:41 2011 -0800 (2011-02-03)
parents 2762b6d3149c
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <asm/current.h>
29 #include <asm/asm_defns.h>
30 #include <asm/page.h>
31 #include <asm/flushtlb.h>
32 #include <asm/fixmap.h>
33 #include <asm/hypercall.h>
34 #include <asm/msr.h>
35 #include <asm/setup.h>
36 #include <asm/numa.h>
37 #include <public/memory.h>
39 /* Parameters for PFN/MADDR compression. */
40 unsigned long __read_mostly max_pdx;
41 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
42 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
43 unsigned long __read_mostly pfn_top_mask = 0;
44 unsigned long __read_mostly ma_top_mask = 0;
45 unsigned long __read_mostly pfn_hole_mask = 0;
46 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
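/*
 * These masks implement PFN <-> PDX ("page index") compression: a single
 * contiguous run of address bits that is zero in every RAM range can be
 * squeezed out, so the frame table and M2P only have to cover the compressed
 * index space.  Assuming the usual helpers in the x86-64 headers, the
 * translation is roughly:
 *
 *     pdx = (pfn & pfn_pdx_bottom_mask) |
 *           ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
 *
 * pfn_hole_mask covers the squeezed-out bits; pfn_pdx_hole_setup() below
 * derives all of these masks from the memory map.
 */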
48 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
50 /* Top-level master (and idle-domain) page directory. */
51 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
52 idle_pg_table[L4_PAGETABLE_ENTRIES];
54 /* Enough page directories to map bottom 4GB of the memory map. */
55 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
56 l3_identmap[L3_PAGETABLE_ENTRIES];
57 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
58 l2_identmap[4*L2_PAGETABLE_ENTRIES];
60 /* Enough page directories to map the Xen text and static data. */
61 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
62 l3_xenmap[L3_PAGETABLE_ENTRIES];
63 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
64 l2_xenmap[L2_PAGETABLE_ENTRIES];
66 /* Enough page directories to map into the bottom 1GB. */
67 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
68 l3_bootmap[L3_PAGETABLE_ENTRIES];
69 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
70 l2_bootmap[L2_PAGETABLE_ENTRIES];
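/*
 * An MFN is valid if it lies below max_page, does not fall into the PDX
 * compression hole, and belongs to a PDX group that was marked valid when
 * the frame table was populated.
 */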
72 int __mfn_valid(unsigned long mfn)
73 {
74 return likely(mfn < max_page) &&
75 likely(!(mfn & pfn_hole_mask)) &&
76 likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT,
77 pdx_group_valid));
78 }
80 void *alloc_xen_pagetable(void)
81 {
82 unsigned long mfn;
84 if ( !early_boot )
85 {
86 struct page_info *pg = alloc_domheap_page(NULL, 0);
87 BUG_ON(pg == NULL);
88 return page_to_virt(pg);
89 }
91 mfn = alloc_boot_pages(1, 1);
92 return mfn_to_virt(mfn);
93 }
95 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
96 {
97 l4_pgentry_t *pl4e;
99 pl4e = &idle_pg_table[l4_table_offset(v)];
100 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
101 {
102 l3_pgentry_t *pl3e = alloc_xen_pagetable();
103 clear_page(pl3e);
104 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
105 }
107 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
108 }
110 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
111 {
112 l3_pgentry_t *pl3e;
114 pl3e = virt_to_xen_l3e(v);
115 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
116 {
117 l2_pgentry_t *pl2e = alloc_xen_pagetable();
118 clear_page(pl2e);
119 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
120 }
122 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
123 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
124 }
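/*
 * do_page_walk(): software walk of a PV guest's page tables, returning a
 * direct-map pointer to the byte backing guest virtual address 'addr', or
 * NULL if it is not mapped.  2M and 1G superpages are handled by offsetting
 * into the large frame; HVM vcpus are rejected since their tables are not
 * walked here.
 */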
126 void *do_page_walk(struct vcpu *v, unsigned long addr)
127 {
128 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
129 l4_pgentry_t l4e, *l4t;
130 l3_pgentry_t l3e, *l3t;
131 l2_pgentry_t l2e, *l2t;
132 l1_pgentry_t l1e, *l1t;
134 if ( is_hvm_vcpu(v) )
135 return NULL;
137 l4t = mfn_to_virt(mfn);
138 l4e = l4t[l4_table_offset(addr)];
139 mfn = l4e_get_pfn(l4e);
140 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
141 return NULL;
143 l3t = mfn_to_virt(mfn);
144 l3e = l3t[l3_table_offset(addr)];
145 mfn = l3e_get_pfn(l3e);
146 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
147 return NULL;
148 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
149 return mfn_to_virt(mfn) + (addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
151 l2t = mfn_to_virt(mfn);
152 l2e = l2t[l2_table_offset(addr)];
153 mfn = l2e_get_pfn(l2e);
154 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
155 return NULL;
156 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
157 return mfn_to_virt(mfn) + (addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
159 l1t = mfn_to_virt(mfn);
160 l1e = l1t[l1_table_offset(addr)];
161 mfn = l1e_get_pfn(l1e);
162 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
163 return NULL;
165 return mfn_to_virt(mfn) + (addr & ~PAGE_MASK);
166 }
168 void __init pfn_pdx_hole_setup(unsigned long mask)
169 {
170 unsigned int i, j, bottom_shift = 0, hole_shift = 0;
172 /*
173 * We skip the first MAX_ORDER bits, as we never want to compress them.
174 * This guarantees that page-pointer arithmetic remains valid within
175 * contiguous aligned ranges of 2^MAX_ORDER pages. Among others, our
176 * buddy allocator relies on this assumption.
177 */
178 for ( j = MAX_ORDER-1; ; )
179 {
180 i = find_next_zero_bit(&mask, BITS_PER_LONG, j);
181 j = find_next_bit(&mask, BITS_PER_LONG, i);
182 if ( j >= BITS_PER_LONG )
183 break;
184 if ( j - i > hole_shift )
185 {
186 hole_shift = j - i;
187 bottom_shift = i;
188 }
189 }
190 if ( !hole_shift )
191 return;
193 printk(KERN_INFO "PFN compression on bits %u...%u\n",
194 bottom_shift, bottom_shift + hole_shift - 1);
196 pfn_pdx_hole_shift = hole_shift;
197 pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1;
198 ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1;
199 pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift;
200 pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask);
201 ma_top_mask = pfn_top_mask << PAGE_SHIFT;
202 }
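/*
 * Worked example with illustrative values: if bits 24-35 of every valid PFN
 * are zero, the loop above finds bottom_shift = 24 and hole_shift = 12, so
 *     pfn_pdx_bottom_mask = 0x0000000000ffffff
 *     pfn_hole_mask       = 0x0000000fff000000
 *     pfn_top_mask        = 0xfffffff000000000
 * and a PFN such as 0x1000001234 compresses to PDX 0x1001234, shrinking the
 * range the frame table and M2P have to span.
 */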
204 /*
205 * Allocate page table pages for m2p table
206 */
207 struct mem_hotadd_info
208 {
209 unsigned long spfn;
210 unsigned long epfn;
211 unsigned long cur;
212 };
214 int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
215 {
216 return (pfn < info->epfn && pfn >= info->spfn);
217 }
219 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
220 {
221 unsigned long mfn;
223 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
224 info->cur >= info->spfn);
226 mfn = info->cur;
227 info->cur += (1UL << PAGETABLE_ORDER);
228 return mfn;
229 }
231 #define M2P_NO_MAPPED 0
232 #define M2P_2M_MAPPED 1
233 #define M2P_1G_MAPPED 2
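/*
 * Report how the read-only M2P slot covering 'spfn' is currently mapped:
 * by a 1G superpage, by a 2M superpage, or not at all.  The hot-add path
 * uses this to skip ranges whose M2P entries already exist.
 */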
234 static int m2p_mapped(unsigned long spfn)
235 {
236 unsigned long va;
237 l3_pgentry_t *l3_ro_mpt;
238 l2_pgentry_t *l2_ro_mpt;
240 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
241 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
243 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
244 (_PAGE_PRESENT |_PAGE_PSE))
245 {
246 case _PAGE_PSE|_PAGE_PRESENT:
247 return M2P_1G_MAPPED;
248 break;
249 /* Check for next level */
250 case _PAGE_PRESENT:
251 break;
252 default:
253 return M2P_NO_MAPPED;
254 break;
255 }
256 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
258 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
259 return M2P_2M_MAPPED;
261 return M2P_NO_MAPPED;
262 }
264 int share_hotadd_m2p_table(struct mem_hotadd_info *info)
265 {
266 unsigned long i, n, v, m2p_start_mfn = 0;
267 l3_pgentry_t l3e;
268 l2_pgentry_t l2e;
270 /* M2P table is mappable read-only by privileged domains. */
271 for ( v = RDWR_MPT_VIRT_START;
272 v != RDWR_MPT_VIRT_END;
273 v += n << PAGE_SHIFT )
274 {
275 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
276 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
277 l3_table_offset(v)];
278 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
279 continue;
280 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
281 {
282 n = L1_PAGETABLE_ENTRIES;
283 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
284 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
285 continue;
286 m2p_start_mfn = l2e_get_pfn(l2e);
287 }
288 else
289 continue;
291 for ( i = 0; i < n; i++ )
292 {
293 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
294 if (hotadd_mem_valid(m2p_start_mfn + i, info))
295 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
296 }
297 }
299 for ( v = RDWR_COMPAT_MPT_VIRT_START;
300 v != RDWR_COMPAT_MPT_VIRT_END;
301 v += 1 << L2_PAGETABLE_SHIFT )
302 {
303 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
304 l3_table_offset(v)];
305 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
306 continue;
307 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
308 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
309 continue;
310 m2p_start_mfn = l2e_get_pfn(l2e);
312 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
313 {
314 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
315 if (hotadd_mem_valid(m2p_start_mfn + i, info))
316 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
317 }
318 }
319 return 0;
320 }
322 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
323 {
324 unsigned long i, va, rwva, pt_pfn;
325 unsigned long smap = info->spfn, emap = info->spfn;
327 l3_pgentry_t *l3_ro_mpt;
328 l2_pgentry_t *l2_ro_mpt;
330 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
331 return;
333 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
334 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
336 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
338 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
340 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
342 for ( i = smap; i < emap; )
343 {
344 va = HIRO_COMPAT_MPT_VIRT_START +
345 i * sizeof(*compat_machine_to_phys_mapping);
346 rwva = RDWR_COMPAT_MPT_VIRT_START +
347 i * sizeof(*compat_machine_to_phys_mapping);
348 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
349 {
350 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
351 if ( hotadd_mem_valid(pt_pfn, info) )
352 {
353 destroy_xen_mappings(rwva, rwva +
354 (1UL << L2_PAGETABLE_SHIFT));
355 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
356 }
357 }
359 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
360 }
362 return;
363 }
365 void destroy_m2p_mapping(struct mem_hotadd_info *info)
366 {
367 l3_pgentry_t *l3_ro_mpt;
368 unsigned long i, va, rwva;
369 unsigned long smap = info->spfn, emap = info->epfn;
371 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
373 /*
374 * No need to clean up the m2p structures that existed before the hotplug.
375 */
376 for (i = smap; i < emap;)
377 {
378 unsigned long pt_pfn;
379 l2_pgentry_t *l2_ro_mpt;
381 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
382 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
384 /* 1G mapping should not be created by mem hotadd */
385 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
386 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
387 {
388 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
389 (1UL << (L3_PAGETABLE_SHIFT - 3) );
390 continue;
391 }
393 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
394 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
395 {
396 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
397 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
398 continue;
399 }
401 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
402 if ( hotadd_mem_valid(pt_pfn, info) )
403 {
404 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
406 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
407 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
408 }
409 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
410 (1UL << (L2_PAGETABLE_SHIFT - 3));
411 }
413 destroy_compat_m2p_mapping(info);
415 /* Brute-Force flush all TLB */
416 flush_tlb_all();
417 return;
418 }
420 /*
421 * Allocate and map the compatibility mode machine-to-phys table.
422 * spfn/epfn: the pfn ranges to be setup
423 * free_s/free_e: the pfn ranges that are still free
424 */
425 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
426 {
427 unsigned long i, va, smap, emap, rwva, epfn = info->epfn;
428 unsigned int n, memflags;
429 l3_pgentry_t *l3_ro_mpt = NULL;
430 l2_pgentry_t *l2_ro_mpt = NULL;
431 struct page_info *l1_pg;
433 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
435 /*
436 * Notice: For hot-added memory, only the range below m2p_compat_vstart
437 * will be filled up (assuming memory is discontiguous when booting).
438 */
439 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
440 return 0;
442 if (epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2))
443 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
445 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
446 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
448 va = HIRO_COMPAT_MPT_VIRT_START +
449 smap * sizeof(*compat_machine_to_phys_mapping);
450 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
452 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
454 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
456 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
457 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
458 sizeof(*compat_machine_to_phys_mapping))
459 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
460 sizeof(*compat_machine_to_phys_mapping));
462 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
463 {
464 va = HIRO_COMPAT_MPT_VIRT_START +
465 i * sizeof(*compat_machine_to_phys_mapping);
467 rwva = RDWR_COMPAT_MPT_VIRT_START +
468 i * sizeof(*compat_machine_to_phys_mapping);
470 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
471 continue;
473 for ( n = 0; n < CNT; ++n)
474 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
475 break;
476 if ( n == CNT )
477 continue;
479 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
481 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
482 map_pages_to_xen(rwva,
483 page_to_mfn(l1_pg),
484 1UL << PAGETABLE_ORDER,
485 PAGE_HYPERVISOR);
486 memset((void *)rwva, 0x55, 1UL << L2_PAGETABLE_SHIFT);
487 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
488 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
489 }
490 #undef CNT
491 #undef MFN
492 return 0;
493 }
495 /*
496 * Allocate and map the machine-to-phys table.
497 * The L3 for the RO/RW MPT and the L2 for the compat MPT should already be set up.
498 */
499 int setup_m2p_table(struct mem_hotadd_info *info)
500 {
501 unsigned long i, va, smap, emap;
502 unsigned int n, memflags;
503 l2_pgentry_t *l2_ro_mpt = NULL;
504 l3_pgentry_t *l3_ro_mpt = NULL;
505 struct page_info *l1_pg, *l2_pg;
506 int ret = 0;
508 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
509 & _PAGE_PRESENT);
510 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
512 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
513 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
514 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
516 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
518 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
519 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
520 sizeof(*machine_to_phys_mapping))
522 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
523 sizeof(*machine_to_phys_mapping));
525 i = smap;
526 while ( i < emap )
527 {
528 switch ( m2p_mapped(i) )
529 {
530 case M2P_1G_MAPPED:
531 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
532 (1UL << (L3_PAGETABLE_SHIFT - 3));
533 continue;
534 case M2P_2M_MAPPED:
535 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
536 (1UL << (L2_PAGETABLE_SHIFT - 3));
537 continue;
538 default:
539 break;
540 }
542 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
543 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
545 for ( n = 0; n < CNT; ++n)
546 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
547 break;
548 if ( n == CNT )
549 l1_pg = NULL;
550 else
551 {
552 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
553 map_pages_to_xen(
554 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
555 page_to_mfn(l1_pg),
556 1UL << PAGETABLE_ORDER,
557 PAGE_HYPERVISOR);
558 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
559 0x55, 1UL << L2_PAGETABLE_SHIFT);
561 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
562 _PAGE_PSE));
563 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
564 _PAGE_PRESENT )
565 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
566 l2_table_offset(va);
567 else
568 {
569 l2_pg = alloc_domheap_page(NULL, memflags);
571 if (!l2_pg)
572 {
573 ret = -ENOMEM;
574 goto error;
575 }
577 l2_ro_mpt = page_to_virt(l2_pg);
578 clear_page(l2_ro_mpt);
579 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
580 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
581 l2_ro_mpt += l2_table_offset(va);
582 }
584 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
585 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg,
586 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
587 }
588 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
589 l2_ro_mpt = NULL;
590 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
591 }
592 #undef CNT
593 #undef MFN
595 ret = setup_compat_m2p_table(info);
596 error:
597 return ret;
598 }
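/*
 * Boot-time construction of the machine-to-phys tables:
 *  - pre-populate the direct-map L3s when memory hotplug is possible,
 *  - build the user-readable M2P, preferring 1G mappings where supported and
 *    falling back to 2M mappings, skipping 2M chunks with no valid MFNs,
 *  - build the compatibility (32-bit) M2P at HIRO_COMPAT_MPT_VIRT_START,
 *  - install the linear page-table mapping.
 */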
600 void __init paging_init(void)
601 {
602 unsigned long i, mpt_size, va;
603 unsigned int n, memflags;
604 l3_pgentry_t *l3_ro_mpt;
605 l2_pgentry_t *l2_ro_mpt = NULL;
606 struct page_info *l1_pg, *l2_pg, *l3_pg;
608 /*
609 * We set up the L3s for the 1:1 mapping if the host supports memory hotplug,
610 * to avoid having to sync the 1:1 mapping in the page fault handler.
611 */
612 if ( mem_hotplug )
613 {
614 unsigned long va;
616 for ( va = DIRECTMAP_VIRT_START;
617 va < DIRECTMAP_VIRT_END;
618 va += (1UL << L4_PAGETABLE_SHIFT) )
619 {
620 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
621 _PAGE_PRESENT) )
622 {
623 l3_pg = alloc_domheap_page(NULL, 0);
624 if ( !l3_pg )
625 goto nomem;
626 l3_ro_mpt = page_to_virt(l3_pg);
627 clear_page(l3_ro_mpt);
628 l4e_write(&idle_pg_table[l4_table_offset(va)],
629 l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
630 }
631 }
632 }
634 /* Create user-accessible L2 directory to map the MPT for guests. */
635 if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
636 goto nomem;
637 l3_ro_mpt = page_to_virt(l3_pg);
638 clear_page(l3_ro_mpt);
639 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
640 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
642 /*
643 * Allocate and map the machine-to-phys table.
644 * This also ensures L3 is present for fixmaps.
645 */
646 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
647 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
648 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
649 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
650 sizeof(*machine_to_phys_mapping))
651 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
652 sizeof(*machine_to_phys_mapping));
653 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
654 {
655 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
656 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
657 memflags = MEMF_node(phys_to_nid(i <<
658 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
660 if ( cpu_has_page1gb &&
661 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
662 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
663 {
664 unsigned int k, holes;
666 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
667 {
668 for ( n = 0; n < CNT; ++n)
669 if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
670 break;
671 if ( n == CNT )
672 ++holes;
673 }
674 if ( k == holes )
675 {
676 i += (1UL << PAGETABLE_ORDER) - 1;
677 continue;
678 }
679 if ( holes == 0 &&
680 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
681 memflags)) != NULL )
682 {
683 map_pages_to_xen(
684 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
685 page_to_mfn(l1_pg),
686 1UL << (2 * PAGETABLE_ORDER),
687 PAGE_HYPERVISOR);
688 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
689 0x77, 1UL << L3_PAGETABLE_SHIFT);
691 ASSERT(!l2_table_offset(va));
692 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
693 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
694 l3e_from_page(l1_pg,
695 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
696 i += (1UL << PAGETABLE_ORDER) - 1;
697 continue;
698 }
699 }
701 for ( n = 0; n < CNT; ++n)
702 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
703 break;
704 if ( n == CNT )
705 l1_pg = NULL;
706 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
707 memflags)) == NULL )
708 goto nomem;
709 else
710 {
711 map_pages_to_xen(
712 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
713 page_to_mfn(l1_pg),
714 1UL << PAGETABLE_ORDER,
715 PAGE_HYPERVISOR);
716 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
717 0x55, 1UL << L2_PAGETABLE_SHIFT);
718 }
719 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
720 {
721 if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
722 goto nomem;
723 l2_ro_mpt = page_to_virt(l2_pg);
724 clear_page(l2_ro_mpt);
725 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
726 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
727 ASSERT(!l2_table_offset(va));
728 }
729 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
730 if ( l1_pg )
731 l2e_write(l2_ro_mpt, l2e_from_page(
732 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
733 l2_ro_mpt++;
734 }
735 #undef CNT
736 #undef MFN
738 /* Create user-accessible L2 directory to map the MPT for compat guests. */
739 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
740 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
741 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
742 HIRO_COMPAT_MPT_VIRT_START)]);
743 if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
744 goto nomem;
745 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
746 clear_page(l2_ro_mpt);
747 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
748 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
749 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
750 /* Allocate and map the compatibility mode machine-to-phys table. */
751 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
752 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
753 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
754 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
755 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
756 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
757 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
758 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
759 sizeof(*compat_machine_to_phys_mapping))
760 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
761 sizeof(*compat_machine_to_phys_mapping));
762 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
763 {
764 memflags = MEMF_node(phys_to_nid(i <<
765 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
766 for ( n = 0; n < CNT; ++n)
767 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
768 break;
769 if ( n == CNT )
770 continue;
771 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
772 memflags)) == NULL )
773 goto nomem;
774 map_pages_to_xen(
775 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
776 page_to_mfn(l1_pg),
777 1UL << PAGETABLE_ORDER,
778 PAGE_HYPERVISOR);
779 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
780 (i << L2_PAGETABLE_SHIFT)),
781 0x55,
782 1UL << L2_PAGETABLE_SHIFT);
783 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
784 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
785 }
786 #undef CNT
787 #undef MFN
789 /* Set up linear page table mapping. */
790 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
791 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
792 return;
794 nomem:
795 panic("Not enough memory for m2p table\n");
796 }
798 void __init setup_idle_pagetable(void)
799 {
800 /* Install per-domain mappings for idle domain. */
801 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
802 l4e_from_page(
803 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
804 __PAGE_HYPERVISOR));
805 }
807 void __init zap_low_mappings(void)
808 {
809 BUG_ON(num_online_cpus() != 1);
811 /* Remove aliased mapping of first 1:1 PML4 entry. */
812 l4e_write(&idle_pg_table[0], l4e_empty());
813 flush_local(FLUSH_TLB_GLOBAL);
815 /* Replace with mapping of the boot trampoline only. */
816 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
817 0x10, __PAGE_HYPERVISOR);
818 }
820 void *compat_arg_xlat_virt_base(void)
821 {
822 return current->arch.compat_arg_xlat;
823 }
825 int setup_compat_arg_xlat(struct vcpu *v)
826 {
827 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
828 struct page_info *pg;
830 pg = alloc_domheap_pages(NULL, order, 0);
831 if ( pg == NULL )
832 return -ENOMEM;
834 v->arch.compat_arg_xlat = page_to_virt(pg);
835 return 0;
836 }
838 void free_compat_arg_xlat(struct vcpu *v)
839 {
840 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
841 if ( v->arch.compat_arg_xlat != NULL )
842 free_domheap_pages(virt_to_page(v->arch.compat_arg_xlat), order);
843 v->arch.compat_arg_xlat = NULL;
844 }
846 void cleanup_frame_table(struct mem_hotadd_info *info)
847 {
848 unsigned long sva, eva;
849 l3_pgentry_t l3e;
850 l2_pgentry_t l2e;
851 unsigned long spfn, epfn;
853 spfn = info->spfn;
854 epfn = info->epfn;
856 sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
857 eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));
859 /* Initialize all pages. */
860 memset(mfn_to_page(spfn), -1,
861 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
863 while (sva < eva)
864 {
865 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
866 l3_table_offset(sva)];
867 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
868 (l3e_get_flags(l3e) & _PAGE_PSE) )
869 {
870 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
871 (1UL << L3_PAGETABLE_SHIFT);
872 continue;
873 }
875 l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
876 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
878 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
879 (_PAGE_PSE | _PAGE_PRESENT) )
880 {
881 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
882 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
883 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
884 (1UL << L2_PAGETABLE_SHIFT) - 1));
886 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
887 (1UL << L2_PAGETABLE_SHIFT);
888 continue;
889 }
891 ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
892 _PAGE_PRESENT);
893 sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
894 (1UL << PAGE_SHIFT);
895 }
897 /* Brute-Force flush all TLB */
898 flush_tlb_all();
899 }
901 /* Should we be paranoid about failure in map_pages_to_xen? */
902 static int setup_frametable_chunk(void *start, void *end,
903 struct mem_hotadd_info *info)
904 {
905 unsigned long s = (unsigned long)start;
906 unsigned long e = (unsigned long)end;
907 unsigned long mfn;
909 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
910 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
912 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
913 {
914 mfn = alloc_hotadd_mfn(info);
915 map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR);
916 }
917 memset(start, -1, s - (unsigned long)start);
919 return 0;
920 }
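/*
 * Extend the frame table to cover a hot-added [spfn, epfn) range.  Only PDX
 * groups that are not already marked valid are populated, and the backing
 * pages are taken from the hot-added range itself via alloc_hotadd_mfn().
 */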
922 int extend_frame_table(struct mem_hotadd_info *info)
923 {
924 unsigned long cidx, nidx, eidx, spfn, epfn;
926 spfn = info->spfn;
927 epfn = info->epfn;
929 eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
930 nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;
932 ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
933 (pfn_to_pdx(epfn) <= FRAMETABLE_SIZE / sizeof(struct page_info)) );
935 if ( test_bit(cidx, pdx_group_valid) )
936 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
938 if ( cidx >= eidx )
939 return 0;
941 while ( cidx < eidx )
942 {
943 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
944 if ( nidx >= eidx )
945 nidx = eidx;
946 setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
947 pdx_to_page(nidx * PDX_GROUP_COUNT),
948 info);
950 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
951 }
953 memset(mfn_to_page(spfn), 0,
954 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
955 return 0;
956 }
958 void __init subarch_init_memory(void)
959 {
960 unsigned long i, n, v, m2p_start_mfn;
961 l3_pgentry_t l3e;
962 l2_pgentry_t l2e;
964 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
965 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
966 /* M2P table is mappable read-only by privileged domains. */
967 for ( v = RDWR_MPT_VIRT_START;
968 v != RDWR_MPT_VIRT_END;
969 v += n << PAGE_SHIFT )
970 {
971 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
972 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
973 l3_table_offset(v)];
974 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
975 continue;
976 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
977 {
978 n = L1_PAGETABLE_ENTRIES;
979 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
980 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
981 continue;
982 m2p_start_mfn = l2e_get_pfn(l2e);
983 }
984 else
985 {
986 m2p_start_mfn = l3e_get_pfn(l3e);
987 }
989 for ( i = 0; i < n; i++ )
990 {
991 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
992 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
993 }
994 }
996 for ( v = RDWR_COMPAT_MPT_VIRT_START;
997 v != RDWR_COMPAT_MPT_VIRT_END;
998 v += 1 << L2_PAGETABLE_SHIFT )
999 {
1000 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
1001 l3_table_offset(v)];
1002 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1003 continue;
1004 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
1005 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1006 continue;
1007 m2p_start_mfn = l2e_get_pfn(l2e);
1009 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1010 {
1011 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
1012 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
1013 }
1014 }
1015 }
1017 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1018 {
1019 struct xen_machphys_mfn_list xmml;
1020 l3_pgentry_t l3e;
1021 l2_pgentry_t l2e;
1022 unsigned long v;
1023 xen_pfn_t mfn, last_mfn;
1024 unsigned int i;
1025 long rc = 0;
1027 switch ( op )
1028 {
1029 case XENMEM_machphys_mfn_list:
1030 if ( copy_from_guest(&xmml, arg, 1) )
1031 return -EFAULT;
1033 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1034 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1035 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
1036 (i != xmml.max_extents) &&
1037 (v < (unsigned long)(machine_to_phys_mapping + max_page));
1038 i++, v += 1UL << L2_PAGETABLE_SHIFT )
1039 {
1040 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
1041 l3_table_offset(v)];
1042 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1043 mfn = last_mfn;
1044 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
1045 {
1046 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
1047 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1048 mfn = l2e_get_pfn(l2e);
1049 else
1050 mfn = last_mfn;
1051 }
1052 else
1053 {
1054 mfn = l3e_get_pfn(l3e)
1055 + (l2_table_offset(v) << PAGETABLE_ORDER);
1056 }
1057 ASSERT(mfn);
1058 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
1059 return -EFAULT;
1060 last_mfn = mfn;
1061 }
1063 xmml.nr_extents = i;
1064 if ( copy_to_guest(arg, &xmml, 1) )
1065 return -EFAULT;
1067 break;
1069 default:
1070 rc = -ENOSYS;
1071 break;
1072 }
1074 return rc;
1075 }
1077 long do_stack_switch(unsigned long ss, unsigned long esp)
1078 {
1079 fixup_guest_stack_selector(current->domain, ss);
1080 current->arch.guest_context.kernel_ss = ss;
1081 current->arch.guest_context.kernel_sp = esp;
1082 return 0;
1083 }
1085 long do_set_segment_base(unsigned int which, unsigned long base)
1086 {
1087 struct vcpu *v = current;
1088 long ret = 0;
1090 switch ( which )
1091 {
1092 case SEGBASE_FS:
1093 if ( wrmsr_safe(MSR_FS_BASE, base) )
1094 ret = -EFAULT;
1095 else
1096 v->arch.guest_context.fs_base = base;
1097 break;
1099 case SEGBASE_GS_USER:
1100 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base) )
1101 ret = -EFAULT;
1102 else
1103 v->arch.guest_context.gs_base_user = base;
1104 break;
1106 case SEGBASE_GS_KERNEL:
1107 if ( wrmsr_safe(MSR_GS_BASE, base) )
1108 ret = -EFAULT;
1109 else
1110 v->arch.guest_context.gs_base_kernel = base;
1111 break;
1113 case SEGBASE_GS_USER_SEL:
1114 __asm__ __volatile__ (
1115 " swapgs \n"
1116 "1: movl %k0,%%gs \n"
1117 " "safe_swapgs" \n"
1118 ".section .fixup,\"ax\" \n"
1119 "2: xorl %k0,%k0 \n"
1120 " jmp 1b \n"
1121 ".previous \n"
1122 _ASM_EXTABLE(1b, 2b)
1123 : : "r" (base&0xffff) );
1124 break;
1126 default:
1127 ret = -EINVAL;
1128 break;
1129 }
1131 return ret;
1132 }
1135 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1136 int check_descriptor(const struct domain *dom, struct desc_struct *d)
1137 {
1138 u32 a = d->a, b = d->b;
1139 u16 cs;
1140 unsigned int dpl;
1142 /* A not-present descriptor will always fault, so is safe. */
1143 if ( !(b & _SEGMENT_P) )
1144 goto good;
1146 /* Check and fix up the DPL. */
1147 dpl = (b >> 13) & 3;
1148 __fixup_guest_selector(dom, dpl);
1149 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1151 /* All code and data segments are okay. No base/limit checking. */
1152 if ( (b & _SEGMENT_S) )
1153 {
1154 if ( is_pv_32bit_domain(dom) )
1155 {
1156 unsigned long base, limit;
1158 if ( b & _SEGMENT_L )
1159 goto bad;
1161 /*
1162 * Older PAE Linux guests use segments which are limited to
1163 * 0xf6800000. Extend these to allow access to the larger read-only
1164 * M2P table available in 32on64 mode.
1165 */
1166 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
1168 limit = (b & 0xf0000) | (a & 0xffff);
1169 limit++; /* We add one because limit is inclusive. */
1171 if ( (b & _SEGMENT_G) )
1172 limit <<= 12;
1174 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1175 {
1176 a |= 0x0000ffff;
1177 b |= 0x000f0000;
1178 }
1179 }
1181 goto good;
1182 }
1184 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1185 if ( (b & _SEGMENT_TYPE) == 0x000 )
1186 goto good;
1188 /* Everything but a call gate is discarded here. */
1189 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1190 goto bad;
1192 /* Validate the target code selector. */
1193 cs = a >> 16;
1194 if ( !guest_gate_selector_okay(dom, cs) )
1195 goto bad;
1196 /*
1197 * Force DPL to zero, causing a GP fault with its error code indicating
1198 * the gate in use, allowing emulation. This is necessary because with
1199 * native guests (kernel in ring 3) call gates cannot be used directly
1200 * to transition from user to kernel mode (and whether a gate is used
1201 * to enter the kernel can only be determined when the gate is being
1202 * used), and with compat guests call gates cannot be used at all as
1203 * there are only 64-bit ones.
1204 * Store the original DPL in the selector's RPL field.
1205 */
1206 b &= ~_SEGMENT_DPL;
1207 cs = (cs & ~3) | dpl;
1208 a = (a & 0xffffU) | (cs << 16);
1210 /* Reserved bits must be zero. */
1211 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1212 goto bad;
1214 good:
1215 d->a = a;
1216 d->b = b;
1217 return 1;
1218 bad:
1219 return 0;
1220 }
1222 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1223 {
1224 struct domain *d = current->domain;
1226 return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
1227 (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1228 (addr < MACH2PHYS_COMPAT_VIRT_END);
1229 }
1231 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1232 {
1233 struct domain *d = current->domain;
1234 l4_pgentry_t *pl4e = NULL;
1235 l4_pgentry_t l4e;
1236 l3_pgentry_t *pl3e = NULL;
1237 l3_pgentry_t l3e;
1238 l2_pgentry_t *pl2e = NULL;
1239 l2_pgentry_t l2e, idle_l2e;
1240 unsigned long mfn, idle_index;
1241 int ret = 0;
1243 if (!is_pv_32on64_domain(d))
1244 return 0;
1246 if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1247 (addr >= MACH2PHYS_COMPAT_VIRT_END) )
1248 return 0;
1250 mfn = (read_cr3()) >> PAGE_SHIFT;
1252 pl4e = map_domain_page(mfn);
1254 l4e = pl4e[0];
1256 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1257 goto unmap;
1259 mfn = l4e_get_pfn(l4e);
1260 /* We don't need to get the page type here since it is the current CR3. */
1261 pl3e = map_domain_page(mfn);
1263 l3e = pl3e[3];
1265 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1266 goto unmap;
1268 mfn = l3e_get_pfn(l3e);
1269 pl2e = map_domain_page(mfn);
1271 l2e = pl2e[l2_table_offset(addr)];
1273 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1274 goto unmap;
1276 idle_index = (l2_table_offset(addr) -
1277 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1278 sizeof(l2_pgentry_t);
1279 idle_l2e = compat_idle_pg_table_l2[idle_index];
1280 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1281 goto unmap;
1283 memcpy(&pl2e[l2_table_offset(addr)],
1284 &compat_idle_pg_table_l2[idle_index],
1285 sizeof(l2_pgentry_t));
1287 ret = EXCRET_fault_fixed;
1289 unmap:
1290 if ( pl4e )
1291 unmap_domain_page(pl4e);
1292 if ( pl3e )
1293 unmap_domain_page(pl3e);
1294 if ( pl2e )
1295 unmap_domain_page(pl2e);
1297 return ret;
1298 }
1300 void domain_set_alloc_bitsize(struct domain *d)
1301 {
1302 if ( !is_pv_32on64_domain(d) ||
1303 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1304 d->arch.physaddr_bitsize > 0 )
1305 return;
1306 d->arch.physaddr_bitsize =
1307 /* 2^n entries can be contained in guest's p2m mapping space */
1308 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1309 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1310 + PAGE_SHIFT;
1311 }
1313 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1314 {
1315 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1316 return bits;
1317 return min(d->arch.physaddr_bitsize, bits);
1318 }
1320 int transfer_pages_to_heap(struct mem_hotadd_info *info)
1321 {
1322 unsigned long i;
1323 struct page_info *pg;
1325 /*
1326 * Mark the allocated pages before handing the free pages to the buddy
1327 * allocator, to avoid them being merged in free_heap_pages.
1328 */
1329 for (i = info->spfn; i < info->cur; i++)
1330 {
1331 pg = mfn_to_page(i);
1332 pg->count_info = PGC_state_inuse;
1333 }
1335 init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1337 return 0;
1338 }
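/*
 * Sanity-check a candidate hot-add range: it must fit within the frame-table
 * virtual space, be aligned to PAGETABLE_ORDER, avoid the PDX compression
 * hole, and not overlap memory that is already present.  The final length
 * check ensures the range is large enough to hold its own M2P, compat M2P
 * and frame-table pages, since those are carved out of the new memory.
 */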
1340 int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1341 {
1342 unsigned long s, e, length, sidx, eidx;
1344 if ( (spfn >= epfn) )
1345 return 0;
1347 if (pfn_to_pdx(epfn) > (FRAMETABLE_SIZE / sizeof(*frame_table)))
1348 return 0;
1350 if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1351 return 0;
1353 if ( (spfn | epfn) & pfn_hole_mask )
1354 return 0;
1356 /* Make sure the new range is not present now */
1357 sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1) & ~(PDX_GROUP_COUNT - 1))
1358 / PDX_GROUP_COUNT;
1359 eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
1360 if (sidx >= eidx)
1361 return 0;
1363 s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
1364 if ( s > eidx )
1365 return 0;
1366 e = find_next_bit(pdx_group_valid, eidx, s);
1367 if ( e < eidx )
1368 return 0;
1370 /* Calculate the maximum number of m2p/compat m2p/frametable pages required. */
1371 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1372 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1373 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1375 length = (e - s) * sizeof(unsigned long);
1377 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1378 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1379 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1381 e = min_t(unsigned long, e,
1382 (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1384 if ( e > s )
1385 length += (e -s) * sizeof(unsigned int);
1387 s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1388 e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1390 length += (e - s) * sizeof(struct page_info);
1392 if ((length >> PAGE_SHIFT) > (epfn - spfn))
1393 return 0;
1395 return 1;
1396 }
1398 /*
1399 * Be a bit paranoid about memory allocation failure here, since running
1400 * out of memory may be the very reason for the memory add.
1401 */
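/*
 * Overall flow: validate the range, map it into the direct map, update the
 * NUMA node data, extend the frame table, grow max_page/max_pdx, build the
 * M2P entries, add 1:1 IOMMU mappings for dom0 where applicable, then hand
 * the remaining pages to the heap and share the new M2P pages read-only with
 * privileged guests.  Failures before the pages reach the heap are unwound
 * via the destroy_m2p/destroy_frametable labels.
 */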
1402 int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1403 {
1404 struct mem_hotadd_info info;
1405 int ret, node;
1406 unsigned long old_max = max_page, old_total = total_pages;
1407 unsigned long old_node_start, old_node_span, orig_online;
1408 unsigned long i;
1410 dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1412 if ( !mem_hotadd_check(spfn, epfn) )
1413 return -EINVAL;
1415 if ( (node = setup_node(pxm)) == -1 )
1416 return -EINVAL;
1418 if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1419 {
1420 dprintk(XENLOG_WARNING, "spfn %lx ~ epfn %lx pxm %x node %x "
1421 "is not numa valid", spfn, epfn, pxm, node);
1422 return -EINVAL;
1423 }
1425 ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
1426 epfn - spfn, PAGE_HYPERVISOR);
1427 if ( ret )
1428 return ret;
1430 old_node_start = NODE_DATA(node)->node_start_pfn;
1431 old_node_span = NODE_DATA(node)->node_spanned_pages;
1432 orig_online = node_online(node);
1434 if ( !orig_online )
1435 {
1436 dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm);
1437 NODE_DATA(node)->node_id = node;
1438 NODE_DATA(node)->node_start_pfn = spfn;
1439 NODE_DATA(node)->node_spanned_pages =
1440 epfn - node_start_pfn(node);
1441 node_set_online(node);
1442 }else
1443 {
1444 if (NODE_DATA(node)->node_start_pfn > spfn)
1445 NODE_DATA(node)->node_start_pfn = spfn;
1446 if (node_end_pfn(node) < epfn)
1447 NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1448 }
1450 ret = -EINVAL;
1451 info.spfn = spfn;
1452 info.epfn = epfn;
1453 info.cur = spfn;
1455 ret = extend_frame_table(&info);
1456 if (ret)
1457 goto destroy_frametable;
1459 /* Set max_page, as setup_m2p_table will use it. */
1460 if (max_page < epfn)
1461 {
1462 max_page = epfn;
1463 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1464 }
1465 total_pages += epfn - spfn;
1467 set_pdx_range(spfn, epfn);
1468 ret = setup_m2p_table(&info);
1470 if ( ret )
1471 goto destroy_m2p;
1473 if ( !need_iommu(dom0) )
1474 {
1475 for ( i = spfn; i < epfn; i++ )
1476 if ( iommu_map_page(dom0, i, i, IOMMUF_readable|IOMMUF_writable) )
1477 break;
1478 if ( i != epfn )
1479 {
1480 while (i-- > old_max)
1481 iommu_unmap_page(dom0, i);
1482 goto destroy_m2p;
1483 }
1484 }
1486 /* We can't revert any more */
1487 transfer_pages_to_heap(&info);
1489 share_hotadd_m2p_table(&info);
1491 return 0;
1493 destroy_m2p:
1494 destroy_m2p_mapping(&info);
1495 max_page = old_max;
1496 total_pages = old_total;
1497 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1498 destroy_frametable:
1499 cleanup_frame_table(&info);
1500 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1501 (unsigned long)mfn_to_virt(epfn));
1503 if ( !orig_online )
1504 node_set_offline(node);
1505 NODE_DATA(node)->node_start_pfn = old_node_start;
1506 NODE_DATA(node)->node_spanned_pages = old_node_span;
1508 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1509 (unsigned long)mfn_to_virt(epfn));
1510 return ret;
1511 }
1513 #include "compat/mm.c"
1515 /*
1516 * Local variables:
1517 * mode: C
1518 * c-set-style: "BSD"
1519 * c-basic-offset: 4
1520 * tab-width: 4
1521 * indent-tabs-mode: nil
1522 * End:
1523 */