
view xen/arch/x86/x86_64/mm.c @ 21043:b8d2a4134a68

x86-64: fix hotplug fault handling for 32-bit domains' M2P range

- handle only when memory hotplug regions were actually found
- fix off-by-one error in fault handler's sanity checking
- use first L4 table entry

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Mar 03 17:41:58 2010 +0000 (2010-03-03)
parents 217f6aa87716
children c1b6647c6828
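For reference, the fault path this change adjusts only engages for addresses inside a 32-bit PV domain's read-only compat M2P window, and only when memory hotplug regions exist. A minimal sketch of that gate, condensed from pagefault_by_memadd() and handle_memadd_fault() further down in this file (the helper name here is illustrative, not part of the source):

/* Does this fault fall in the compat M2P range that a hotplug may have grown? */
static int is_compat_m2p_fault(unsigned long addr, struct cpu_user_regs *regs)
{
    struct domain *d = current->domain;

    return mem_hotplug &&                         /* hotplug regions were found */
           guest_mode(regs) &&                    /* fault raised by the guest */
           is_pv_32bit_domain(d) &&               /* only compat guests map this window */
           (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
           (addr < MACH2PHYS_COMPAT_VIRT_END);
}

When the gate matches, handle_memadd_fault() copies the corresponding L2 entry from compat_idle_pg_table_l2 into the faulting domain's current page tables and returns EXCRET_fault_fixed.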
line source
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <asm/current.h>
29 #include <asm/asm_defns.h>
30 #include <asm/page.h>
31 #include <asm/flushtlb.h>
32 #include <asm/fixmap.h>
33 #include <asm/hypercall.h>
34 #include <asm/msr.h>
35 #include <asm/setup.h>
36 #include <asm/numa.h>
37 #include <public/memory.h>
39 /* Parameters for PFN/MADDR compression. */
40 unsigned long __read_mostly max_pdx;
41 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
42 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
43 unsigned long __read_mostly pfn_top_mask = 0;
44 unsigned long __read_mostly ma_top_mask = 0;
45 unsigned long __read_mostly pfn_hole_mask = 0;
46 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
48 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
50 DEFINE_PER_CPU_READ_MOSTLY(void *, compat_arg_xlat);
52 /* Top-level master (and idle-domain) page directory. */
53 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
54 idle_pg_table[L4_PAGETABLE_ENTRIES];
56 /* Enough page directories to map bottom 4GB of the memory map. */
57 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
58 l3_identmap[L3_PAGETABLE_ENTRIES];
59 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
60 l2_identmap[4*L2_PAGETABLE_ENTRIES];
62 /* Enough page directories to map the Xen text and static data. */
63 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
64 l3_xenmap[L3_PAGETABLE_ENTRIES];
65 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
66 l2_xenmap[L2_PAGETABLE_ENTRIES];
68 int __mfn_valid(unsigned long mfn)
69 {
70 return likely(mfn < max_page) &&
71 likely(!(mfn & pfn_hole_mask)) &&
72 likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT,
73 pdx_group_valid));
74 }
76 void *alloc_xen_pagetable(void)
77 {
78 unsigned long mfn;
80 if ( !early_boot )
81 {
82 struct page_info *pg = alloc_domheap_page(NULL, 0);
83 BUG_ON(pg == NULL);
84 return page_to_virt(pg);
85 }
87 mfn = alloc_boot_pages(1, 1);
88 return mfn_to_virt(mfn);
89 }
91 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
92 {
93 l4_pgentry_t *pl4e;
95 pl4e = &idle_pg_table[l4_table_offset(v)];
96 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
97 {
98 l3_pgentry_t *pl3e = alloc_xen_pagetable();
99 clear_page(pl3e);
100 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
101 }
103 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
104 }
106 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
107 {
108 l3_pgentry_t *pl3e;
110 pl3e = virt_to_xen_l3e(v);
111 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
112 {
113 l2_pgentry_t *pl2e = alloc_xen_pagetable();
114 clear_page(pl2e);
115 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
116 }
118 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
119 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
120 }
122 void *do_page_walk(struct vcpu *v, unsigned long addr)
123 {
124 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
125 l4_pgentry_t l4e, *l4t;
126 l3_pgentry_t l3e, *l3t;
127 l2_pgentry_t l2e, *l2t;
128 l1_pgentry_t l1e, *l1t;
130 if ( is_hvm_vcpu(v) )
131 return NULL;
133 l4t = mfn_to_virt(mfn);
134 l4e = l4t[l4_table_offset(addr)];
135 mfn = l4e_get_pfn(l4e);
136 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
137 return NULL;
139 l3t = mfn_to_virt(mfn);
140 l3e = l3t[l3_table_offset(addr)];
141 mfn = l3e_get_pfn(l3e);
142 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
143 return NULL;
144 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
145 return mfn_to_virt(mfn) + (addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
147 l2t = mfn_to_virt(mfn);
148 l2e = l2t[l2_table_offset(addr)];
149 mfn = l2e_get_pfn(l2e);
150 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
151 return NULL;
152 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
153 return mfn_to_virt(mfn) + (addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
155 l1t = mfn_to_virt(mfn);
156 l1e = l1t[l1_table_offset(addr)];
157 mfn = l1e_get_pfn(l1e);
158 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
159 return NULL;
161 return mfn_to_virt(mfn) + (addr & ~PAGE_MASK);
162 }
164 void __init pfn_pdx_hole_setup(unsigned long mask)
165 {
166 unsigned int i, j, bottom_shift, hole_shift;
168 for ( hole_shift = bottom_shift = j = 0; ; )
169 {
170 i = find_next_zero_bit(&mask, BITS_PER_LONG, j);
171 j = find_next_bit(&mask, BITS_PER_LONG, i);
172 if ( j >= BITS_PER_LONG )
173 break;
174 if ( j - i > hole_shift )
175 {
176 hole_shift = j - i;
177 bottom_shift = i;
178 }
179 }
180 if ( !hole_shift )
181 return;
183 printk(KERN_INFO "PFN compression on bits %u...%u\n",
184 bottom_shift, bottom_shift + hole_shift - 1);
186 pfn_pdx_hole_shift = hole_shift;
187 pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1;
188 ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1;
189 pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift;
190 pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask);
191 ma_top_mask = pfn_top_mask << PAGE_SHIFT;
192 }
194 /*
195 * Allocate page table pages for m2p table
196 */
197 struct mem_hotadd_info
198 {
199 unsigned long spfn;
200 unsigned long epfn;
201 unsigned long cur;
202 };
204 int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
205 {
206 return (pfn < info->epfn && pfn >= info->spfn);
207 }
209 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
210 {
211 unsigned long mfn;
213 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
214 info->cur >= info->spfn);
216 mfn = info->cur;
217 info->cur += (1UL << PAGETABLE_ORDER);
218 return mfn;
219 }
221 #define M2P_NO_MAPPED 0
222 #define M2P_2M_MAPPED 1
223 #define M2P_1G_MAPPED 2
224 static int m2p_mapped(unsigned long spfn)
225 {
226 unsigned long va;
227 l3_pgentry_t *l3_ro_mpt;
228 l2_pgentry_t *l2_ro_mpt;
230 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
231 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
233 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
234 (_PAGE_PRESENT |_PAGE_PSE))
235 {
236 case _PAGE_PSE|_PAGE_PRESENT:
237 return M2P_1G_MAPPED;
238 break;
239 /* Check for next level */
240 case _PAGE_PRESENT:
241 break;
242 default:
243 return M2P_NO_MAPPED;
244 break;
245 }
246 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
248 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
249 return M2P_2M_MAPPED;
251 return M2P_NO_MAPPED;
252 }
254 int share_hotadd_m2p_table(struct mem_hotadd_info *info)
255 {
256 unsigned long i, n, v, m2p_start_mfn = 0;
257 l3_pgentry_t l3e;
258 l2_pgentry_t l2e;
260 /* M2P table is mappable read-only by privileged domains. */
261 for ( v = RDWR_MPT_VIRT_START;
262 v != RDWR_MPT_VIRT_END;
263 v += n << PAGE_SHIFT )
264 {
265 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
266 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
267 l3_table_offset(v)];
268 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
269 continue;
270 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
271 {
272 n = L1_PAGETABLE_ENTRIES;
273 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
274 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
275 continue;
276 m2p_start_mfn = l2e_get_pfn(l2e);
277 }
278 else
279 continue;
281 for ( i = 0; i < n; i++ )
282 {
283 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
284 if (hotadd_mem_valid(m2p_start_mfn + i, info))
285 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
286 }
287 }
289 for ( v = RDWR_COMPAT_MPT_VIRT_START;
290 v != RDWR_COMPAT_MPT_VIRT_END;
291 v += 1 << L2_PAGETABLE_SHIFT )
292 {
293 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
294 l3_table_offset(v)];
295 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
296 continue;
297 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
298 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
299 continue;
300 m2p_start_mfn = l2e_get_pfn(l2e);
302 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
303 {
304 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
305 if (hotadd_mem_valid(m2p_start_mfn + i, info))
306 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
307 }
308 }
309 return 0;
310 }
312 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
313 {
314 unsigned long i, va, rwva, pt_pfn;
315 unsigned long smap = info->spfn, emap = info->epfn;
317 l3_pgentry_t *l3_ro_mpt;
318 l2_pgentry_t *l2_ro_mpt;
320 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
321 return;
323 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
324 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
326 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
328 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
330 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
332 for ( i = smap; i < emap; )
333 {
334 va = HIRO_COMPAT_MPT_VIRT_START +
335 i * sizeof(*compat_machine_to_phys_mapping);
336 rwva = RDWR_COMPAT_MPT_VIRT_START +
337 i * sizeof(*compat_machine_to_phys_mapping);
338 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
339 {
340 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
341 if ( hotadd_mem_valid(pt_pfn, info) )
342 {
343 destroy_xen_mappings(rwva, rwva +
344 (1UL << L2_PAGETABLE_SHIFT));
345 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
346 }
347 }
349 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
350 }
352 return;
353 }
355 void destroy_m2p_mapping(struct mem_hotadd_info *info)
356 {
357 l3_pgentry_t *l3_ro_mpt;
358 unsigned long i, va, rwva;
359 unsigned long smap = info->spfn, emap = info->epfn;
361 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
363 /*
364 * No need to clean up M2P entries that existed before the hotplug
365 */
366 for (i = smap; i < emap;)
367 {
368 unsigned long pt_pfn;
369 l2_pgentry_t *l2_ro_mpt;
371 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
372 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
374 /* 1G mapping should not be created by mem hotadd */
375 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
376 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
377 {
378 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
379 (1UL << (L3_PAGETABLE_SHIFT - 3) );
380 continue;
381 }
383 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
384 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
385 {
386 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
387 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
388 continue;
389 }
391 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
392 if ( hotadd_mem_valid(pt_pfn, info) )
393 {
394 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
396 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
397 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
398 }
399 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
400 (1UL << (L2_PAGETABLE_SHIFT - 3));
401 }
403 destroy_compat_m2p_mapping(info);
405 /* Brute force: flush all TLBs. */
406 flush_tlb_all();
407 return;
408 }
410 /*
411 * Allocate and map the compatibility mode machine-to-phys table.
412 * spfn/epfn: the pfn range to be set up
413 * free_s/free_e: the pfn ranges that are still free
414 */
415 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
416 {
417 unsigned long i, va, smap, emap, rwva, epfn = info->epfn;
418 unsigned int n, memflags;
419 l3_pgentry_t *l3_ro_mpt = NULL;
420 l2_pgentry_t *l2_ro_mpt = NULL;
421 struct page_info *l1_pg;
423 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
425 /*
426 * Note: for hot-added memory, only the range below m2p_compat_vstart
427 * will be filled (assuming memory is discontiguous when booting).
428 */
429 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
430 return 0;
432 if (epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2))
433 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
435 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
436 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
438 va = HIRO_COMPAT_MPT_VIRT_START +
439 smap * sizeof(*compat_machine_to_phys_mapping);
440 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
442 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
444 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
446 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
447 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
448 sizeof(*compat_machine_to_phys_mapping))
449 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
450 sizeof(*compat_machine_to_phys_mapping));
452 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
453 {
454 va = HIRO_COMPAT_MPT_VIRT_START +
455 i * sizeof(*compat_machine_to_phys_mapping);
457 rwva = RDWR_COMPAT_MPT_VIRT_START +
458 i * sizeof(*compat_machine_to_phys_mapping);
460 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
461 continue;
463 for ( n = 0; n < CNT; ++n)
464 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
465 break;
466 if ( n == CNT )
467 continue;
469 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
471 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
472 map_pages_to_xen(rwva,
473 page_to_mfn(l1_pg),
474 1UL << PAGETABLE_ORDER,
475 PAGE_HYPERVISOR);
476 memset((void *)rwva, 0x55, 1UL << L2_PAGETABLE_SHIFT);
477 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
478 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
479 }
480 #undef CNT
481 #undef MFN
482 return 0;
483 }
485 /*
486 * Allocate and map the machine-to-phys table.
487 * The L3 for the RO/RW MPT and the L2 for the compat MPT should already be set up
488 */
489 int setup_m2p_table(struct mem_hotadd_info *info)
490 {
491 unsigned long i, va, smap, emap;
492 unsigned int n, memflags;
493 l2_pgentry_t *l2_ro_mpt = NULL;
494 l3_pgentry_t *l3_ro_mpt = NULL;
495 struct page_info *l1_pg, *l2_pg;
496 int ret = 0;
498 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
499 & _PAGE_PRESENT);
500 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
502 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
503 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
504 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
506 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
508 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
509 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
510 sizeof(*machine_to_phys_mapping))
512 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
513 sizeof(*machine_to_phys_mapping));
515 i = smap;
516 while ( i < emap )
517 {
518 switch ( m2p_mapped(i) )
519 {
520 case M2P_1G_MAPPED:
521 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
522 (1UL << (L3_PAGETABLE_SHIFT - 3));
523 continue;
524 case M2P_2M_MAPPED:
525 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
526 (1UL << (L2_PAGETABLE_SHIFT - 3));
527 continue;
528 default:
529 break;
530 }
532 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
533 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
535 for ( n = 0; n < CNT; ++n)
536 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
537 break;
538 if ( n == CNT )
539 l1_pg = NULL;
540 else
541 {
542 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
543 map_pages_to_xen(
544 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
545 page_to_mfn(l1_pg),
546 1UL << PAGETABLE_ORDER,
547 PAGE_HYPERVISOR);
548 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
549 0x55, 1UL << L2_PAGETABLE_SHIFT);
551 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
552 _PAGE_PSE));
553 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
554 _PAGE_PRESENT )
555 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
556 l2_table_offset(va);
557 else
558 {
559 l2_pg = alloc_domheap_page(NULL, memflags);
561 if (!l2_pg)
562 {
563 ret = -ENOMEM;
564 goto error;
565 }
567 l2_ro_mpt = page_to_virt(l2_pg);
568 clear_page(l2_ro_mpt);
569 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
570 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
571 l2_ro_mpt += l2_table_offset(va);
572 }
574 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
575 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg,
576 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
577 }
578 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
579 l2_ro_mpt = NULL;
580 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
581 }
582 #undef CNT
583 #undef MFN
585 ret = setup_compat_m2p_table(info);
586 error:
587 return ret;
588 }
590 void __init paging_init(void)
591 {
592 unsigned long i, mpt_size, va;
593 unsigned int n, memflags;
594 l3_pgentry_t *l3_ro_mpt;
595 l2_pgentry_t *l2_ro_mpt = NULL;
596 struct page_info *l1_pg, *l2_pg, *l3_pg;
598 /*
599 * We set up the L3s for the 1:1 mapping if the host supports memory hotplug,
600 * to avoid having to sync the 1:1 mapping in the page fault handler.
601 */
602 if ( mem_hotplug )
603 {
604 unsigned long va;
606 for ( va = DIRECTMAP_VIRT_START;
607 va < DIRECTMAP_VIRT_END;
608 va += (1UL << L4_PAGETABLE_SHIFT) )
609 {
610 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
611 _PAGE_PRESENT) )
612 {
613 l3_pg = alloc_domheap_page(NULL, 0);
614 if ( !l3_pg )
615 goto nomem;
616 l3_ro_mpt = page_to_virt(l3_pg);
617 clear_page(l3_ro_mpt);
618 l4e_write(&idle_pg_table[l4_table_offset(va)],
619 l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
620 }
621 }
622 }
624 /* Create user-accessible L2 directory to map the MPT for guests. */
625 if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
626 goto nomem;
627 l3_ro_mpt = page_to_virt(l3_pg);
628 clear_page(l3_ro_mpt);
629 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
630 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
632 /*
633 * Allocate and map the machine-to-phys table.
634 * This also ensures L3 is present for fixmaps.
635 */
636 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
637 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
638 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
639 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
640 sizeof(*machine_to_phys_mapping))
641 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
642 sizeof(*machine_to_phys_mapping));
643 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
644 {
645 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
646 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
647 memflags = MEMF_node(phys_to_nid(i <<
648 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
650 if ( cpu_has_page1gb &&
651 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
652 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
653 {
654 unsigned int k, holes;
656 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
657 {
658 for ( n = 0; n < CNT; ++n)
659 if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
660 break;
661 if ( n == CNT )
662 ++holes;
663 }
664 if ( k == holes )
665 {
666 i += (1UL << PAGETABLE_ORDER) - 1;
667 continue;
668 }
669 if ( holes == 0 &&
670 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
671 memflags)) != NULL )
672 {
673 map_pages_to_xen(
674 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
675 page_to_mfn(l1_pg),
676 1UL << (2 * PAGETABLE_ORDER),
677 PAGE_HYPERVISOR);
678 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
679 0x77, 1UL << L3_PAGETABLE_SHIFT);
681 ASSERT(!l2_table_offset(va));
682 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
683 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
684 l3e_from_page(l1_pg,
685 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
686 i += (1UL << PAGETABLE_ORDER) - 1;
687 continue;
688 }
689 }
691 for ( n = 0; n < CNT; ++n)
692 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
693 break;
694 if ( n == CNT )
695 l1_pg = NULL;
696 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
697 memflags)) == NULL )
698 goto nomem;
699 else
700 {
701 map_pages_to_xen(
702 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
703 page_to_mfn(l1_pg),
704 1UL << PAGETABLE_ORDER,
705 PAGE_HYPERVISOR);
706 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
707 0x55, 1UL << L2_PAGETABLE_SHIFT);
708 }
709 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
710 {
711 if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
712 goto nomem;
713 l2_ro_mpt = page_to_virt(l2_pg);
714 clear_page(l2_ro_mpt);
715 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
716 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
717 ASSERT(!l2_table_offset(va));
718 }
719 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
720 if ( l1_pg )
721 l2e_write(l2_ro_mpt, l2e_from_page(
722 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
723 l2_ro_mpt++;
724 }
725 #undef CNT
726 #undef MFN
728 /* Create user-accessible L2 directory to map the MPT for compat guests. */
729 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
730 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
731 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
732 HIRO_COMPAT_MPT_VIRT_START)]);
733 if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
734 goto nomem;
735 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
736 clear_page(l2_ro_mpt);
737 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
738 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
739 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
740 /* Allocate and map the compatibility mode machine-to-phys table. */
741 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
742 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
743 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
744 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
745 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
746 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
747 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
748 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
749 sizeof(*compat_machine_to_phys_mapping))
750 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
751 sizeof(*compat_machine_to_phys_mapping));
752 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
753 {
754 memflags = MEMF_node(phys_to_nid(i <<
755 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
756 for ( n = 0; n < CNT; ++n)
757 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
758 break;
759 if ( n == CNT )
760 continue;
761 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
762 memflags)) == NULL )
763 goto nomem;
764 map_pages_to_xen(
765 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
766 page_to_mfn(l1_pg),
767 1UL << PAGETABLE_ORDER,
768 PAGE_HYPERVISOR);
769 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
770 (i << L2_PAGETABLE_SHIFT)),
771 0x55,
772 1UL << L2_PAGETABLE_SHIFT);
773 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
774 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
775 }
776 #undef CNT
777 #undef MFN
779 /* Set up linear page table mapping. */
780 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
781 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
782 return;
784 nomem:
785 panic("Not enough memory for m2p table\n");
786 }
788 void __init setup_idle_pagetable(void)
789 {
790 /* Install per-domain mappings for idle domain. */
791 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
792 l4e_from_page(
793 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
794 __PAGE_HYPERVISOR));
795 }
797 void __init zap_low_mappings(void)
798 {
799 BUG_ON(num_online_cpus() != 1);
801 /* Remove aliased mapping of first 1:1 PML4 entry. */
802 l4e_write(&idle_pg_table[0], l4e_empty());
803 flush_local(FLUSH_TLB_GLOBAL);
805 /* Replace with mapping of the boot trampoline only. */
806 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
807 0x10, __PAGE_HYPERVISOR);
808 }
810 int __cpuinit setup_compat_arg_xlat(unsigned int cpu, int node)
811 {
812 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
813 unsigned long sz = PAGE_SIZE << order;
814 unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
815 struct page_info *pg;
817 pg = alloc_domheap_pages(NULL, order, memflags);
818 if ( !pg )
819 return -ENOMEM;
821 for ( ; (sz -= PAGE_SIZE) >= COMPAT_ARG_XLAT_SIZE; ++pg )
822 free_domheap_page(pg);
824 per_cpu(compat_arg_xlat, cpu) = page_to_virt(pg);
826 return 0;
827 }
829 void cleanup_frame_table(struct mem_hotadd_info *info)
830 {
831 unsigned long sva, eva;
832 l3_pgentry_t l3e;
833 l2_pgentry_t l2e;
834 unsigned long spfn, epfn;
836 spfn = info->spfn;
837 epfn = info->epfn;
839 sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
840 eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));
842 /* Initialize all pages. */
843 memset(mfn_to_page(spfn), -1,
844 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
846 while (sva < eva)
847 {
848 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
849 l3_table_offset(sva)];
850 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
851 (l3e_get_flags(l3e) & _PAGE_PSE) )
852 {
853 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
854 (1UL << L3_PAGETABLE_SHIFT);
855 continue;
856 }
858 l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
859 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
861 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
862 (_PAGE_PSE | _PAGE_PRESENT) )
863 {
864 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
865 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
866 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
867 (1UL << L2_PAGETABLE_SHIFT) - 1));
869 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
870 (1UL << L2_PAGETABLE_SHIFT);
871 continue;
872 }
874 ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
875 _PAGE_PRESENT);
876 sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
877 (1UL << PAGE_SHIFT);
878 }
880 /* Brute force: flush all TLBs. */
881 flush_tlb_all();
882 }
884 /* Should we be paranoid about failure in map_pages_to_xen? */
885 static int setup_frametable_chunk(void *start, void *end,
886 struct mem_hotadd_info *info)
887 {
888 unsigned long s = (unsigned long)start;
889 unsigned long e = (unsigned long)end;
890 unsigned long mfn;
892 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
893 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
895 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
896 {
897 mfn = alloc_hotadd_mfn(info);
898 map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR);
899 }
900 memset(start, -1, s - (unsigned long)start);
902 return 0;
903 }
905 int extend_frame_table(struct mem_hotadd_info *info)
906 {
907 unsigned long cidx, nidx, eidx, spfn, epfn;
909 spfn = info->spfn;
910 epfn = info->epfn;
912 eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
913 nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;
915 ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
916 (pfn_to_pdx(epfn) <= FRAMETABLE_SIZE / sizeof(struct page_info)) );
918 if ( test_bit(cidx, pdx_group_valid) )
919 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
921 if ( cidx >= eidx )
922 return 0;
924 while ( cidx < eidx )
925 {
926 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
927 if ( nidx >= eidx )
928 nidx = eidx;
929 setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
930 pdx_to_page(nidx * PDX_GROUP_COUNT),
931 info);
933 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
934 }
936 memset(mfn_to_page(spfn), 0,
937 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
938 return 0;
939 }
941 void __init subarch_init_memory(void)
942 {
943 unsigned long i, n, v, m2p_start_mfn;
944 l3_pgentry_t l3e;
945 l2_pgentry_t l2e;
947 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
948 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
949 /* M2P table is mappable read-only by privileged domains. */
950 for ( v = RDWR_MPT_VIRT_START;
951 v != RDWR_MPT_VIRT_END;
952 v += n << PAGE_SHIFT )
953 {
954 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
955 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
956 l3_table_offset(v)];
957 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
958 continue;
959 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
960 {
961 n = L1_PAGETABLE_ENTRIES;
962 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
963 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
964 continue;
965 m2p_start_mfn = l2e_get_pfn(l2e);
966 }
967 else
968 {
969 m2p_start_mfn = l3e_get_pfn(l3e);
970 }
972 for ( i = 0; i < n; i++ )
973 {
974 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
975 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
976 }
977 }
979 for ( v = RDWR_COMPAT_MPT_VIRT_START;
980 v != RDWR_COMPAT_MPT_VIRT_END;
981 v += 1 << L2_PAGETABLE_SHIFT )
982 {
983 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
984 l3_table_offset(v)];
985 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
986 continue;
987 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
988 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
989 continue;
990 m2p_start_mfn = l2e_get_pfn(l2e);
992 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
993 {
994 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
995 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
996 }
997 }
999 if ( setup_compat_arg_xlat(smp_processor_id(),
1000 cpu_to_node[0]) )
1001 panic("Could not setup argument translation area");
1004 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1006 struct xen_machphys_mfn_list xmml;
1007 l3_pgentry_t l3e;
1008 l2_pgentry_t l2e;
1009 unsigned long v;
1010 xen_pfn_t mfn, last_mfn;
1011 unsigned int i;
1012 long rc = 0;
1014 switch ( op )
1016 case XENMEM_machphys_mfn_list:
1017 if ( copy_from_guest(&xmml, arg, 1) )
1018 return -EFAULT;
1020 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1021 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1022 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
1023 (i != xmml.max_extents) &&
1024 (v < (unsigned long)(machine_to_phys_mapping + max_page));
1025 i++, v += 1UL << L2_PAGETABLE_SHIFT )
1027 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
1028 l3_table_offset(v)];
1029 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1030 mfn = last_mfn;
1031 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
1033 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
1034 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1035 mfn = l2e_get_pfn(l2e);
1036 else
1037 mfn = last_mfn;
1039 else
1041 mfn = l3e_get_pfn(l3e)
1042 + (l2_table_offset(v) << PAGETABLE_ORDER);
1044 ASSERT(mfn);
1045 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
1046 return -EFAULT;
1047 last_mfn = mfn;
1050 xmml.nr_extents = i;
1051 if ( copy_to_guest(arg, &xmml, 1) )
1052 return -EFAULT;
1054 break;
1056 default:
1057 rc = -ENOSYS;
1058 break;
1061 return rc;
1064 long do_stack_switch(unsigned long ss, unsigned long esp)
1066 fixup_guest_stack_selector(current->domain, ss);
1067 current->arch.guest_context.kernel_ss = ss;
1068 current->arch.guest_context.kernel_sp = esp;
1069 return 0;
1072 long do_set_segment_base(unsigned int which, unsigned long base)
1074 struct vcpu *v = current;
1075 long ret = 0;
1077 switch ( which )
1079 case SEGBASE_FS:
1080 if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
1081 ret = -EFAULT;
1082 else
1083 v->arch.guest_context.fs_base = base;
1084 break;
1086 case SEGBASE_GS_USER:
1087 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
1088 ret = -EFAULT;
1089 else
1090 v->arch.guest_context.gs_base_user = base;
1091 break;
1093 case SEGBASE_GS_KERNEL:
1094 if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
1095 ret = -EFAULT;
1096 else
1097 v->arch.guest_context.gs_base_kernel = base;
1098 break;
1100 case SEGBASE_GS_USER_SEL:
1101 __asm__ __volatile__ (
1102 " swapgs \n"
1103 "1: movl %k0,%%gs \n"
1104 " "safe_swapgs" \n"
1105 ".section .fixup,\"ax\" \n"
1106 "2: xorl %k0,%k0 \n"
1107 " jmp 1b \n"
1108 ".previous \n"
1109 ".section __ex_table,\"a\"\n"
1110 " .align 8 \n"
1111 " .quad 1b,2b \n"
1112 ".previous "
1113 : : "r" (base&0xffff) );
1114 break;
1116 default:
1117 ret = -EINVAL;
1118 break;
1121 return ret;
1125 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1126 int check_descriptor(const struct domain *dom, struct desc_struct *d)
1128 u32 a = d->a, b = d->b;
1129 u16 cs;
1130 unsigned int dpl;
1132 /* A not-present descriptor will always fault, so is safe. */
1133 if ( !(b & _SEGMENT_P) )
1134 goto good;
1136 /* Check and fix up the DPL. */
1137 dpl = (b >> 13) & 3;
1138 __fixup_guest_selector(dom, dpl);
1139 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1141 /* All code and data segments are okay. No base/limit checking. */
1142 if ( (b & _SEGMENT_S) )
1144 if ( is_pv_32bit_domain(dom) )
1146 unsigned long base, limit;
1148 if ( b & _SEGMENT_L )
1149 goto bad;
1151 /*
1152 * Older PAE Linux guests use segments which are limited to
1153 * 0xf6800000. Extend these to allow access to the larger read-only
1154 * M2P table available in 32on64 mode.
1155 */
1156 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
1158 limit = (b & 0xf0000) | (a & 0xffff);
1159 limit++; /* We add one because limit is inclusive. */
1161 if ( (b & _SEGMENT_G) )
1162 limit <<= 12;
1164 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1166 a |= 0x0000ffff;
1167 b |= 0x000f0000;
1171 goto good;
1174 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1175 if ( (b & _SEGMENT_TYPE) == 0x000 )
1176 goto good;
1178 /* Everything but a call gate is discarded here. */
1179 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1180 goto bad;
1182 /* Validate the target code selector. */
1183 cs = a >> 16;
1184 if ( !guest_gate_selector_okay(dom, cs) )
1185 goto bad;
1186 /*
1187 * Force DPL to zero, causing a GP fault with its error code indicating
1188 * the gate in use, allowing emulation. This is necessary because with
1189 * native guests (kernel in ring 3) call gates cannot be used directly
1190 * to transition from user to kernel mode (and whether a gate is used
1191 * to enter the kernel can only be determined when the gate is being
1192 * used), and with compat guests call gates cannot be used at all as
1193 * there are only 64-bit ones.
1194 * Store the original DPL in the selector's RPL field.
1195 */
1196 b &= ~_SEGMENT_DPL;
1197 cs = (cs & ~3) | dpl;
1198 a = (a & 0xffffU) | (cs << 16);
1200 /* Reserved bits must be zero. */
1201 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1202 goto bad;
1204 good:
1205 d->a = a;
1206 d->b = b;
1207 return 1;
1208 bad:
1209 return 0;
1212 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1214 struct domain *d = current->domain;
1216 return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
1217 (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1218 (addr < MACH2PHYS_COMPAT_VIRT_END);
1221 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1223 struct domain *d = current->domain;
1224 l4_pgentry_t *pl4e = NULL;
1225 l4_pgentry_t l4e;
1226 l3_pgentry_t *pl3e = NULL;
1227 l3_pgentry_t l3e;
1228 l2_pgentry_t *pl2e = NULL;
1229 l2_pgentry_t l2e, idle_l2e;
1230 unsigned long mfn, idle_index;
1231 int ret = 0;
1233 if (!is_pv_32on64_domain(d))
1234 return 0;
1236 if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1237 (addr >= MACH2PHYS_COMPAT_VIRT_END) )
1238 return 0;
1240 mfn = (read_cr3()) >> PAGE_SHIFT;
1242 pl4e = map_domain_page(mfn);
1244 l4e = pl4e[0];
1246 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1247 goto unmap;
1249 mfn = l4e_get_pfn(l4e);
1250 /* No need to get the page type here since this is the current CR3. */
1251 pl3e = map_domain_page(mfn);
1253 l3e = pl3e[3];
1255 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1256 goto unmap;
1258 mfn = l3e_get_pfn(l3e);
1259 pl2e = map_domain_page(mfn);
1261 l2e = pl2e[l2_table_offset(addr)];
1263 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1264 goto unmap;
1266 idle_index = (l2_table_offset(addr) -
1267 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1268 sizeof(l2_pgentry_t);
1269 idle_l2e = compat_idle_pg_table_l2[idle_index];
1270 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1271 goto unmap;
1273 memcpy(&pl2e[l2_table_offset(addr)],
1274 &compat_idle_pg_table_l2[idle_index],
1275 sizeof(l2_pgentry_t));
1277 ret = EXCRET_fault_fixed;
1279 unmap:
1280 if ( pl4e )
1281 unmap_domain_page(pl4e);
1282 if ( pl3e )
1283 unmap_domain_page(pl3e);
1284 if ( pl2e )
1285 unmap_domain_page(pl2e);
1287 return ret;
1290 void domain_set_alloc_bitsize(struct domain *d)
1292 if ( !is_pv_32on64_domain(d) ||
1293 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1294 d->arch.physaddr_bitsize > 0 )
1295 return;
1296 d->arch.physaddr_bitsize =
1297 /* 2^n entries can be contained in guest's p2m mapping space */
1298 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1299 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1300 + PAGE_SHIFT;
1303 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1305 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1306 return bits;
1307 return min(d->arch.physaddr_bitsize, bits);
1310 int transfer_pages_to_heap(struct mem_hotadd_info *info)
1312 unsigned long i;
1313 struct page_info *pg;
1315 /*
1316 * Mark the allocated pages before handing the free pages to the buddy
1317 * allocator, to avoid them being merged in free_heap_pages
1318 */
1319 for (i = info->spfn; i < info->cur; i++)
1321 pg = mfn_to_page(i);
1322 pg->count_info = PGC_state_inuse;
1325 init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1327 return 0;
1330 int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1332 unsigned long s, e, length, sidx, eidx;
1334 if ( (spfn >= epfn) )
1335 return 0;
1337 if (pfn_to_pdx(epfn) > (FRAMETABLE_SIZE / sizeof(*frame_table)))
1338 return 0;
1340 if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1341 return 0;
1343 if ( (spfn | epfn) & pfn_hole_mask )
1344 return 0;
1346 /* Make sure the new range is not present now */
1347 sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1) & ~(PDX_GROUP_COUNT - 1))
1348 / PDX_GROUP_COUNT;
1349 eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
1350 if (sidx >= eidx)
1351 return 0;
1353 s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
1354 if ( s > eidx )
1355 return 0;
1356 e = find_next_bit(pdx_group_valid, eidx, s);
1357 if ( e < eidx )
1358 return 0;
1360 /* Calculate the maximum number of m2p/compat-m2p/frametable pages required */
1361 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1362 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1363 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1365 length = (e - s) * sizeof(unsigned long);
1367 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1368 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1369 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1371 e = min_t(unsigned long, e,
1372 (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1374 if ( e > s )
1375 length += (e - s) * sizeof(unsigned int);
1377 s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1378 e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1380 length += (e - s) * sizeof(struct page_info);
1382 if ((length >> PAGE_SHIFT) > (epfn - spfn))
1383 return 0;
1385 return 1;
1388 /*
1389 * Be a bit paranoid about memory allocation failures, since running out
1390 * of memory may be the very reason for the memory add
1391 */
1392 int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1394 struct mem_hotadd_info info;
1395 int ret, node;
1396 unsigned long old_max = max_page, old_total = total_pages;
1397 unsigned long old_node_start, old_node_span, orig_online;
1398 unsigned long i;
1400 dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1402 if ( !mem_hotadd_check(spfn, epfn) )
1403 return -EINVAL;
1405 if ( (node = setup_node(pxm)) == -1 )
1406 return -EINVAL;
1408 if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1410 dprintk(XENLOG_WARNING, "spfn %lx ~ epfn %lx pxm %x node %x "
1411 "is not numa valid\n", spfn, epfn, pxm, node);
1412 return -EINVAL;
1415 ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
1416 epfn - spfn, PAGE_HYPERVISOR);
1417 if ( ret )
1418 return ret;
1420 old_node_start = NODE_DATA(node)->node_start_pfn;
1421 old_node_span = NODE_DATA(node)->node_spanned_pages;
1422 orig_online = node_online(node);
1424 if ( !orig_online )
1426 dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n", node, pxm);
1427 NODE_DATA(node)->node_id = node;
1428 NODE_DATA(node)->node_start_pfn = spfn;
1429 NODE_DATA(node)->node_spanned_pages =
1430 epfn - node_start_pfn(node);
1431 node_set_online(node);
1432 }else
1434 if (NODE_DATA(node)->node_start_pfn > spfn)
1435 NODE_DATA(node)->node_start_pfn = spfn;
1436 if (node_end_pfn(node) < epfn)
1437 NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1440 ret = -EINVAL;
1441 info.spfn = spfn;
1442 info.epfn = epfn;
1443 info.cur = spfn;
1445 ret = extend_frame_table(&info);
1446 if (ret)
1447 goto destroy_frametable;
1449 /* Set max_page, as setup_m2p_table will use it. */
1450 if (max_page < epfn)
1452 max_page = epfn;
1453 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1455 total_pages += epfn - spfn;
1457 set_pdx_range(spfn, epfn);
1458 ret = setup_m2p_table(&info);
1460 if ( ret )
1461 goto destroy_m2p;
1463 for ( i = spfn; i < epfn; i++ )
1464 if ( iommu_map_page(dom0, i, i) )
1465 break;
1467 if ( i != epfn )
1468 goto destroy_iommu;
1470 /* We can't revert any more */
1471 transfer_pages_to_heap(&info);
1473 share_hotadd_m2p_table(&info);
1475 return 0;
1477 destroy_iommu:
1478 while (i-- > old_max)
1479 iommu_unmap_page(dom0, i);
1481 destroy_m2p:
1482 destroy_m2p_mapping(&info);
1483 max_page = old_max;
1484 total_pages = old_total;
1485 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1486 destroy_frametable:
1487 cleanup_frame_table(&info);
1488 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1489 (unsigned long)mfn_to_virt(epfn));
1491 if ( !orig_online )
1492 node_set_offline(node);
1493 NODE_DATA(node)->node_start_pfn = old_node_start;
1494 NODE_DATA(node)->node_spanned_pages = old_node_span;
1496 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1497 (unsigned long)mfn_to_virt(epfn));
1498 return ret;
1501 #include "compat/mm.c"
1503 /*
1504 * Local variables:
1505 * mode: C
1506 * c-set-style: "BSD"
1507 * c-basic-offset: 4
1508 * tab-width: 4
1509 * indent-tabs-mode: nil
1510 * End:
1511 */