debuggers.hg

view xen/arch/x86/x86_64/mm.c @ 20800:19479955c074

numa: Correct handling of a node with CPUs populated but no memory populated

Since changeset 20599, a node that has no memory populated is marked
parsed, but not online. However, if there are CPUs populated in such a
node, the corresponding CPU mapping (i.e. cpu_to_node) is still set up
to point at the offline node, which causes trouble for memory
allocation.

This patch changes init_cpu_to_node() and srat_detect_node() to take
the offline-node situation into account.

Now apicid_to_node is only used to keep the CPU-to-node mapping
provided by the BIOS, and should not be used for memory allocation
anymore.

One thing left to do is to update the cpu_to_node mapping after memory
is populated by memory hot-add.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>

This is a reintroduction of 20726:ddb8c5e798f9, which I incorrectly
reverted in 20745:d3215a968db9

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jan 05 08:38:23 2010 +0000 (2010-01-05)
parents d3215a968db9
children 217f6aa87716
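
The change is easiest to see with a small, stand-alone sketch. This is illustrative only and is not taken from the Xen sources: every identifier below (MAX_CPUS, MAX_NODES, apicid_to_node[], node_online[], pick_node_for_cpu(), ...) is invented for the example. It models the situation described in the changeset message above: firmware (SRAT) reports a CPU as belonging to a node that is parsed but offline because it has no memory, so the CPU has to be given an online node for allocation purposes.

/*
 * Illustrative sketch only -- not the Xen implementation.
 * Build with: gcc -std=c99 -Wall sketch.c
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_CPUS     8
#define MAX_NODES    4
#define NUMA_NO_NODE (-1)

static int  apicid_to_node[MAX_CPUS]; /* firmware-provided CPU->node map */
static bool node_online[MAX_NODES];   /* true only if the node has memory */
static int  cpu_to_node[MAX_CPUS];    /* mapping actually used for allocation */

/* Pick the node to allocate from on behalf of 'cpu'. */
static int pick_node_for_cpu(int cpu)
{
    int node = apicid_to_node[cpu];

    if (node != NUMA_NO_NODE && node_online[node])
        return node;              /* normal case: the node has memory */

    /* The firmware node is offline (memoryless): fall back to the first
     * online node.  A real implementation would pick the closest one. */
    for (int n = 0; n < MAX_NODES; n++)
        if (node_online[n])
            return n;
    return NUMA_NO_NODE;
}

int main(void)
{
    /* Node 1 has CPUs but no memory populated, so it stays offline. */
    node_online[0] = true;
    node_online[1] = false;
    apicid_to_node[0] = 0;
    apicid_to_node[1] = 1;

    for (int cpu = 0; cpu < 2; cpu++)
    {
        cpu_to_node[cpu] = pick_node_for_cpu(cpu);
        printf("cpu %d: firmware node %d -> allocation node %d\n",
               cpu, apicid_to_node[cpu], cpu_to_node[cpu]);
    }
    return 0;
}

In the patched Xen code the same idea is applied when cpu_to_node is initialised, while apicid_to_node keeps only the raw firmware-provided mapping.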
line source
1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc., 59
17 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <xen/config.h>
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <asm/current.h>
29 #include <asm/asm_defns.h>
30 #include <asm/page.h>
31 #include <asm/flushtlb.h>
32 #include <asm/fixmap.h>
33 #include <asm/hypercall.h>
34 #include <asm/msr.h>
35 #include <asm/setup.h>
36 #include <asm/numa.h>
37 #include <public/memory.h>
39 /* Parameters for PFN/MADDR compression. */
40 unsigned long __read_mostly max_pdx;
41 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
42 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
43 unsigned long __read_mostly pfn_top_mask = 0;
44 unsigned long __read_mostly ma_top_mask = 0;
45 unsigned long __read_mostly pfn_hole_mask = 0;
46 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
48 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
50 DEFINE_PER_CPU_READ_MOSTLY(void *, compat_arg_xlat);
52 /* Top-level master (and idle-domain) page directory. */
53 l4_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
54 idle_pg_table[L4_PAGETABLE_ENTRIES];
56 /* Enough page directories to map bottom 4GB of the memory map. */
57 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
58 l3_identmap[L3_PAGETABLE_ENTRIES];
59 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
60 l2_identmap[4*L2_PAGETABLE_ENTRIES];
62 /* Enough page directories to map the Xen text and static data. */
63 l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
64 l3_xenmap[L3_PAGETABLE_ENTRIES];
65 l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
66 l2_xenmap[L2_PAGETABLE_ENTRIES];
68 int __mfn_valid(unsigned long mfn)
69 {
70 return likely(mfn < max_page) &&
71 likely(!(mfn & pfn_hole_mask)) &&
72 likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT,
73 pdx_group_valid));
74 }
76 void *alloc_xen_pagetable(void)
77 {
78 unsigned long mfn;
80 if ( !early_boot )
81 {
82 struct page_info *pg = alloc_domheap_page(NULL, 0);
83 BUG_ON(pg == NULL);
84 return page_to_virt(pg);
85 }
87 mfn = alloc_boot_pages(1, 1);
88 return mfn_to_virt(mfn);
89 }
91 l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
92 {
93 l4_pgentry_t *pl4e;
95 pl4e = &idle_pg_table[l4_table_offset(v)];
96 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
97 {
98 l3_pgentry_t *pl3e = alloc_xen_pagetable();
99 clear_page(pl3e);
100 l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
101 }
103 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
104 }
106 l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
107 {
108 l3_pgentry_t *pl3e;
110 pl3e = virt_to_xen_l3e(v);
111 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
112 {
113 l2_pgentry_t *pl2e = alloc_xen_pagetable();
114 clear_page(pl2e);
115 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
116 }
118 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
119 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
120 }
122 void *do_page_walk(struct vcpu *v, unsigned long addr)
123 {
124 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
125 l4_pgentry_t l4e, *l4t;
126 l3_pgentry_t l3e, *l3t;
127 l2_pgentry_t l2e, *l2t;
128 l1_pgentry_t l1e, *l1t;
130 if ( is_hvm_vcpu(v) )
131 return NULL;
133 l4t = mfn_to_virt(mfn);
134 l4e = l4t[l4_table_offset(addr)];
135 mfn = l4e_get_pfn(l4e);
136 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
137 return NULL;
139 l3t = mfn_to_virt(mfn);
140 l3e = l3t[l3_table_offset(addr)];
141 mfn = l3e_get_pfn(l3e);
142 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
143 return NULL;
144 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
145 return mfn_to_virt(mfn) + (addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
147 l2t = mfn_to_virt(mfn);
148 l2e = l2t[l2_table_offset(addr)];
149 mfn = l2e_get_pfn(l2e);
150 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
151 return NULL;
152 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
153 return mfn_to_virt(mfn) + (addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
155 l1t = mfn_to_virt(mfn);
156 l1e = l1t[l1_table_offset(addr)];
157 mfn = l1e_get_pfn(l1e);
158 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
159 return NULL;
161 return mfn_to_virt(mfn) + (addr & ~PAGE_MASK);
162 }
164 void __init pfn_pdx_hole_setup(unsigned long mask)
165 {
166 unsigned int i, j, bottom_shift, hole_shift;
168 for ( hole_shift = bottom_shift = j = 0; ; )
169 {
170 i = find_next_zero_bit(&mask, BITS_PER_LONG, j);
171 j = find_next_bit(&mask, BITS_PER_LONG, i);
172 if ( j >= BITS_PER_LONG )
173 break;
174 if ( j - i > hole_shift )
175 {
176 hole_shift = j - i;
177 bottom_shift = i;
178 }
179 }
180 if ( !hole_shift )
181 return;
183 printk(KERN_INFO "PFN compression on bits %u...%u\n",
184 bottom_shift, bottom_shift + hole_shift - 1);
186 pfn_pdx_hole_shift = hole_shift;
187 pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1;
188 ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1;
189 pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift;
190 pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask);
191 ma_top_mask = pfn_top_mask << PAGE_SHIFT;
192 }
194 /*
195 * Allocate page table pages for m2p table
196 */
197 struct mem_hotadd_info
198 {
199 unsigned long spfn;
200 unsigned long epfn;
201 unsigned long cur;
202 };
204 int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
205 {
206 return (pfn < info->epfn && pfn >= info->spfn);
207 }
209 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
210 {
211 unsigned mfn;
213 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
214 info->cur >= info->spfn);
216 mfn = info->cur;
217 info->cur += (1UL << PAGETABLE_ORDER);
218 return mfn;
219 }
221 #define M2P_NO_MAPPED 0
222 #define M2P_2M_MAPPED 1
223 #define M2P_1G_MAPPED 2
224 static int m2p_mapped(unsigned long spfn)
225 {
226 unsigned long va;
227 l3_pgentry_t *l3_ro_mpt;
228 l2_pgentry_t *l2_ro_mpt;
230 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
231 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
233 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
234 (_PAGE_PRESENT |_PAGE_PSE))
235 {
236 case _PAGE_PSE|_PAGE_PRESENT:
237 return M2P_1G_MAPPED;
238 break;
239 /* Check for next level */
240 case _PAGE_PRESENT:
241 break;
242 default:
243 return M2P_NO_MAPPED;
244 break;
245 }
246 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
248 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
249 return M2P_2M_MAPPED;
251 return M2P_NO_MAPPED;
252 }
254 int share_hotadd_m2p_table(struct mem_hotadd_info *info)
255 {
256 unsigned long i, n, v, m2p_start_mfn = 0;
257 l3_pgentry_t l3e;
258 l2_pgentry_t l2e;
260 /* M2P table is mappable read-only by privileged domains. */
261 for ( v = RDWR_MPT_VIRT_START;
262 v != RDWR_MPT_VIRT_END;
263 v += n << PAGE_SHIFT )
264 {
265 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
266 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
267 l3_table_offset(v)];
268 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
269 continue;
270 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
271 {
272 n = L1_PAGETABLE_ENTRIES;
273 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
274 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
275 continue;
276 m2p_start_mfn = l2e_get_pfn(l2e);
277 }
278 else
279 continue;
281 for ( i = 0; i < n; i++ )
282 {
283 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
284 if (hotadd_mem_valid(m2p_start_mfn + i, info))
285 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
286 }
287 }
289 for ( v = RDWR_COMPAT_MPT_VIRT_START;
290 v != RDWR_COMPAT_MPT_VIRT_END;
291 v += 1 << L2_PAGETABLE_SHIFT )
292 {
293 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
294 l3_table_offset(v)];
295 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
296 continue;
297 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
298 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
299 continue;
300 m2p_start_mfn = l2e_get_pfn(l2e);
302 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
303 {
304 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
305 if (hotadd_mem_valid(m2p_start_mfn + i, info))
306 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
307 }
308 }
309 return 0;
310 }
312 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
313 {
314 unsigned long i, va, rwva, pt_pfn;
315 unsigned long smap = info->spfn, emap = info->spfn;
317 l3_pgentry_t *l3_ro_mpt;
318 l2_pgentry_t *l2_ro_mpt;
320 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
321 return;
323 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
324 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
326 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
328 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
330 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
332 for ( i = smap; i < emap; )
333 {
334 va = HIRO_COMPAT_MPT_VIRT_START +
335 i * sizeof(*compat_machine_to_phys_mapping);
336 rwva = RDWR_COMPAT_MPT_VIRT_START +
337 i * sizeof(*compat_machine_to_phys_mapping);
338 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
339 {
340 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
341 if ( hotadd_mem_valid(pt_pfn, info) )
342 {
343 destroy_xen_mappings(rwva, rwva +
344 (1UL << L2_PAGETABLE_SHIFT));
345 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
346 }
347 }
349 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
350 }
352 return;
353 }
355 void destroy_m2p_mapping(struct mem_hotadd_info *info)
356 {
357 l3_pgentry_t *l3_ro_mpt;
358 unsigned long i, va, rwva;
359 unsigned long smap = info->spfn, emap = info->epfn;
361 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
363 /*
364 * No need to clean up m2p structures that existed before the hotplug
365 */
366 for (i = smap; i < emap;)
367 {
368 unsigned long pt_pfn;
369 l2_pgentry_t *l2_ro_mpt;
371 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
372 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
374 /* 1G mappings should not be created by memory hot-add */
375 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
376 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
377 {
378 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
379 (1UL << (L3_PAGETABLE_SHIFT - 3) );
380 continue;
381 }
383 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
384 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
385 {
386 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
387 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
388 continue;
389 }
391 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
392 if ( hotadd_mem_valid(pt_pfn, info) )
393 {
394 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
396 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
397 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
398 }
399 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
400 (1UL << (L2_PAGETABLE_SHIFT - 3));
401 }
403 destroy_compat_m2p_mapping(info);
405 /* Brute-force flush of all TLBs */
406 flush_tlb_all();
407 return;
408 }
410 /*
411 * Allocate and map the compatibility mode machine-to-phys table.
412 * spfn/epfn: the pfn range to be set up
413 * free_s/free_e: the pfn range that is still free
414 */
415 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
416 {
417 unsigned long i, va, smap, emap, rwva, epfn = info->epfn;
418 unsigned int n, memflags;
419 l3_pgentry_t *l3_ro_mpt = NULL;
420 l2_pgentry_t *l2_ro_mpt = NULL;
421 struct page_info *l1_pg;
423 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
425 /*
426 * Note: for hot-added memory, only the range below m2p_compat_vstart
427 * will be filled in (assuming memory is discontiguous at boot).
428 */
429 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
430 return 0;
432 if (epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2))
433 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
435 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
436 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
438 va = HIRO_COMPAT_MPT_VIRT_START +
439 smap * sizeof(*compat_machine_to_phys_mapping);
440 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
442 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
444 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
446 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
447 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
448 sizeof(*compat_machine_to_phys_mapping))
449 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
450 sizeof(*compat_machine_to_phys_mapping));
452 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
453 {
454 va = HIRO_COMPAT_MPT_VIRT_START +
455 i * sizeof(*compat_machine_to_phys_mapping);
457 rwva = RDWR_COMPAT_MPT_VIRT_START +
458 i * sizeof(*compat_machine_to_phys_mapping);
460 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
461 continue;
463 for ( n = 0; n < CNT; ++n)
464 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
465 break;
466 if ( n == CNT )
467 continue;
469 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
471 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
472 map_pages_to_xen(rwva,
473 page_to_mfn(l1_pg),
474 1UL << PAGETABLE_ORDER,
475 PAGE_HYPERVISOR);
476 memset((void *)rwva, 0x55, 1UL << L2_PAGETABLE_SHIFT);
477 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
478 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
479 }
480 #undef CNT
481 #undef MFN
482 return 0;
483 }
485 /*
486 * Allocate and map the machine-to-phys table.
487 * The L3 for the RO/RW MPT and the L2 for the compat MPT should be set up already
488 */
489 int setup_m2p_table(struct mem_hotadd_info *info)
490 {
491 unsigned long i, va, smap, emap;
492 unsigned int n, memflags;
493 l2_pgentry_t *l2_ro_mpt = NULL;
494 l3_pgentry_t *l3_ro_mpt = NULL;
495 struct page_info *l1_pg, *l2_pg;
496 int ret = 0;
498 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
499 & _PAGE_PRESENT);
500 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
502 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
503 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
504 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
506 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
508 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
509 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
510 sizeof(*machine_to_phys_mapping))
512 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
513 sizeof(*machine_to_phys_mapping));
515 i = smap;
516 while ( i < emap )
517 {
518 switch ( m2p_mapped(i) )
519 {
520 case M2P_1G_MAPPED:
521 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
522 (1UL << (L3_PAGETABLE_SHIFT - 3));
523 continue;
524 case M2P_2M_MAPPED:
525 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
526 (1UL << (L2_PAGETABLE_SHIFT - 3));
527 continue;
528 default:
529 break;
530 }
532 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
533 memflags = MEMF_node(phys_to_nid(i << PAGE_SHIFT));
535 for ( n = 0; n < CNT; ++n)
536 if ( mfn_valid(i + n * PDX_GROUP_COUNT) )
537 break;
538 if ( n == CNT )
539 l1_pg = NULL;
540 else
541 {
542 l1_pg = mfn_to_page(alloc_hotadd_mfn(info));
543 map_pages_to_xen(
544 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
545 page_to_mfn(l1_pg),
546 1UL << PAGETABLE_ORDER,
547 PAGE_HYPERVISOR);
548 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
549 0x55, 1UL << L2_PAGETABLE_SHIFT);
551 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
552 _PAGE_PSE));
553 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
554 _PAGE_PRESENT )
555 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
556 l2_table_offset(va);
557 else
558 {
559 l2_pg = alloc_domheap_page(NULL, memflags);
561 if (!l2_pg)
562 {
563 ret = -ENOMEM;
564 goto error;
565 }
567 l2_ro_mpt = page_to_virt(l2_pg);
568 clear_page(l2_ro_mpt);
569 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
570 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
571 l2_ro_mpt += l2_table_offset(va);
572 }
574 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
575 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg,
576 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
577 }
578 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
579 l2_ro_mpt = NULL;
580 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
581 }
582 #undef CNT
583 #undef MFN
585 ret = setup_compat_m2p_table(info);
586 error:
587 return ret;
588 }
590 void __init paging_init(void)
591 {
592 unsigned long i, mpt_size, va;
593 unsigned int n, memflags;
594 l3_pgentry_t *l3_ro_mpt;
595 l2_pgentry_t *l2_ro_mpt = NULL;
596 struct page_info *l1_pg, *l2_pg, *l3_pg;
598 /*
599 * We set up the L3s for the 1:1 mapping if the host supports memory hotplug,
600 * to avoid having to sync the 1:1 mapping in the page fault handler
601 */
602 if ( mem_hotplug )
603 {
604 unsigned long va;
606 for ( va = DIRECTMAP_VIRT_START;
607 va < DIRECTMAP_VIRT_END;
608 va += (1UL << L4_PAGETABLE_SHIFT) )
609 {
610 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
611 _PAGE_PRESENT) )
612 {
613 l3_pg = alloc_domheap_page(NULL, 0);
614 if ( !l3_pg )
615 goto nomem;
616 l3_ro_mpt = page_to_virt(l3_pg);
617 clear_page(l3_ro_mpt);
618 l4e_write(&idle_pg_table[l4_table_offset(va)],
619 l4e_from_page(l3_pg, __PAGE_HYPERVISOR));
620 }
621 }
622 }
624 /* Create user-accessible L2 directory to map the MPT for guests. */
625 if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL )
626 goto nomem;
627 l3_ro_mpt = page_to_virt(l3_pg);
628 clear_page(l3_ro_mpt);
629 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
630 l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER));
632 /*
633 * Allocate and map the machine-to-phys table.
634 * This also ensures L3 is present for fixmaps.
635 */
636 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
637 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
638 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
639 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
640 sizeof(*machine_to_phys_mapping))
641 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
642 sizeof(*machine_to_phys_mapping));
643 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
644 {
645 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
646 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
647 memflags = MEMF_node(phys_to_nid(i <<
648 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
650 if ( cpu_has_page1gb &&
651 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
652 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
653 {
654 unsigned int k, holes;
656 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
657 {
658 for ( n = 0; n < CNT; ++n)
659 if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) )
660 break;
661 if ( n == CNT )
662 ++holes;
663 }
664 if ( k == holes )
665 {
666 i += (1UL << PAGETABLE_ORDER) - 1;
667 continue;
668 }
669 if ( holes == 0 &&
670 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
671 memflags)) != NULL )
672 {
673 map_pages_to_xen(
674 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
675 page_to_mfn(l1_pg),
676 1UL << (2 * PAGETABLE_ORDER),
677 PAGE_HYPERVISOR);
678 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
679 0x77, 1UL << L3_PAGETABLE_SHIFT);
681 ASSERT(!l2_table_offset(va));
682 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
683 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
684 l3e_from_page(l1_pg,
685 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
686 i += (1UL << PAGETABLE_ORDER) - 1;
687 continue;
688 }
689 }
691 for ( n = 0; n < CNT; ++n)
692 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
693 break;
694 if ( n == CNT )
695 l1_pg = NULL;
696 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
697 memflags)) == NULL )
698 goto nomem;
699 else
700 {
701 map_pages_to_xen(
702 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
703 page_to_mfn(l1_pg),
704 1UL << PAGETABLE_ORDER,
705 PAGE_HYPERVISOR);
706 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
707 0x55, 1UL << L2_PAGETABLE_SHIFT);
708 }
709 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
710 {
711 if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
712 goto nomem;
713 l2_ro_mpt = page_to_virt(l2_pg);
714 clear_page(l2_ro_mpt);
715 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
716 l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
717 ASSERT(!l2_table_offset(va));
718 }
719 /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
720 if ( l1_pg )
721 l2e_write(l2_ro_mpt, l2e_from_page(
722 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
723 l2_ro_mpt++;
724 }
725 #undef CNT
726 #undef MFN
728 /* Create user-accessible L2 directory to map the MPT for compat guests. */
729 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
730 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
731 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
732 HIRO_COMPAT_MPT_VIRT_START)]);
733 if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
734 goto nomem;
735 compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg);
736 clear_page(l2_ro_mpt);
737 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
738 l3e_from_page(l2_pg, __PAGE_HYPERVISOR));
739 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
740 /* Allocate and map the compatibility mode machine-to-phys table. */
741 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
742 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
743 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
744 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
745 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
746 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
747 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
748 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
749 sizeof(*compat_machine_to_phys_mapping))
750 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
751 sizeof(*compat_machine_to_phys_mapping));
752 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
753 {
754 memflags = MEMF_node(phys_to_nid(i <<
755 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
756 for ( n = 0; n < CNT; ++n)
757 if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) )
758 break;
759 if ( n == CNT )
760 continue;
761 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
762 memflags)) == NULL )
763 goto nomem;
764 map_pages_to_xen(
765 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
766 page_to_mfn(l1_pg),
767 1UL << PAGETABLE_ORDER,
768 PAGE_HYPERVISOR);
769 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
770 (i << L2_PAGETABLE_SHIFT)),
771 0x55,
772 1UL << L2_PAGETABLE_SHIFT);
773 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
774 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
775 }
776 #undef CNT
777 #undef MFN
779 /* Set up linear page table mapping. */
780 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
781 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR));
782 return;
784 nomem:
785 panic("Not enough memory for m2p table\n");
786 }
788 void __init setup_idle_pagetable(void)
789 {
790 /* Install per-domain mappings for idle domain. */
791 l4e_write(&idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)],
792 l4e_from_page(
793 virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
794 __PAGE_HYPERVISOR));
795 }
797 void __init zap_low_mappings(void)
798 {
799 BUG_ON(num_online_cpus() != 1);
801 /* Remove aliased mapping of first 1:1 PML4 entry. */
802 l4e_write(&idle_pg_table[0], l4e_empty());
803 flush_local(FLUSH_TLB_GLOBAL);
805 /* Replace with mapping of the boot trampoline only. */
806 map_pages_to_xen(BOOT_TRAMPOLINE, BOOT_TRAMPOLINE >> PAGE_SHIFT,
807 0x10, __PAGE_HYPERVISOR);
808 }
810 int __cpuinit setup_compat_arg_xlat(unsigned int cpu, int node)
811 {
812 unsigned int order = get_order_from_bytes(COMPAT_ARG_XLAT_SIZE);
813 unsigned long sz = PAGE_SIZE << order;
814 unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
815 struct page_info *pg;
817 pg = alloc_domheap_pages(NULL, order, memflags);
818 if ( !pg )
819 return -ENOMEM;
821 for ( ; (sz -= PAGE_SIZE) >= COMPAT_ARG_XLAT_SIZE; ++pg )
822 free_domheap_page(pg);
824 per_cpu(compat_arg_xlat, cpu) = page_to_virt(pg);
826 return 0;
827 }
829 void cleanup_frame_table(struct mem_hotadd_info *info)
830 {
831 unsigned long sva, eva;
832 l3_pgentry_t l3e;
833 l2_pgentry_t l2e;
834 unsigned long spfn, epfn;
836 spfn = info->spfn;
837 epfn = info->epfn;
839 sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
840 eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));
842 /* Initialize all pages */
843 memset(mfn_to_page(spfn), -1,
844 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
846 while (sva < eva)
847 {
848 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
849 l3_table_offset(sva)];
850 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
851 (l3e_get_flags(l3e) & _PAGE_PSE) )
852 {
853 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
854 (1UL << L3_PAGETABLE_SHIFT);
855 continue;
856 }
858 l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
859 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
861 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
862 (_PAGE_PSE | _PAGE_PRESENT) )
863 {
864 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
865 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
866 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
867 (1UL << L2_PAGETABLE_SHIFT) - 1));
869 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
870 (1UL << L2_PAGETABLE_SHIFT);
871 continue;
872 }
874 ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
875 _PAGE_PRESENT);
876 sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
877 (1UL << PAGE_SHIFT);
878 }
880 /* Brute-force flush of all TLBs */
881 flush_tlb_all();
882 }
884 /* Should we be paranoid about failure in map_pages_to_xen? */
885 static int setup_frametable_chunk(void *start, void *end,
886 struct mem_hotadd_info *info)
887 {
888 unsigned long s = (unsigned long)start;
889 unsigned long e = (unsigned long)end;
890 unsigned long mfn;
892 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
893 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
895 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
896 {
897 mfn = alloc_hotadd_mfn(info);
898 map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR);
899 }
900 memset(start, -1, s - (unsigned long)start);
902 return 0;
903 }
905 int extend_frame_table(struct mem_hotadd_info *info)
906 {
907 unsigned long cidx, nidx, eidx, spfn, epfn;
909 spfn = info->spfn;
910 epfn = info->epfn;
912 eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
913 nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;
915 ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
916 (pfn_to_pdx(epfn) <= FRAMETABLE_SIZE / sizeof(struct page_info)) );
918 if ( test_bit(cidx, pdx_group_valid) )
919 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
921 if ( cidx >= eidx )
922 return 0;
924 while ( cidx < eidx )
925 {
926 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
927 if ( nidx >= eidx )
928 nidx = eidx;
929 setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
930 pdx_to_page(nidx * PDX_GROUP_COUNT),
931 info);
933 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
934 }
936 memset(mfn_to_page(spfn), 0,
937 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
938 return 0;
939 }
941 void __init subarch_init_memory(void)
942 {
943 unsigned long i, n, v, m2p_start_mfn;
944 l3_pgentry_t l3e;
945 l2_pgentry_t l2e;
947 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
948 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
949 /* M2P table is mappable read-only by privileged domains. */
950 for ( v = RDWR_MPT_VIRT_START;
951 v != RDWR_MPT_VIRT_END;
952 v += n << PAGE_SHIFT )
953 {
954 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
955 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
956 l3_table_offset(v)];
957 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
958 continue;
959 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
960 {
961 n = L1_PAGETABLE_ENTRIES;
962 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
963 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
964 continue;
965 m2p_start_mfn = l2e_get_pfn(l2e);
966 }
967 else
968 {
969 m2p_start_mfn = l3e_get_pfn(l3e);
970 }
972 for ( i = 0; i < n; i++ )
973 {
974 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
975 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
976 }
977 }
979 for ( v = RDWR_COMPAT_MPT_VIRT_START;
980 v != RDWR_COMPAT_MPT_VIRT_END;
981 v += 1 << L2_PAGETABLE_SHIFT )
982 {
983 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
984 l3_table_offset(v)];
985 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
986 continue;
987 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
988 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
989 continue;
990 m2p_start_mfn = l2e_get_pfn(l2e);
992 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
993 {
994 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
995 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
996 }
997 }
999 if ( setup_compat_arg_xlat(smp_processor_id(),
1000 cpu_to_node[0]) )
1001 panic("Could not setup argument translation area");
1002 }
1004 long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
1005 {
1006 struct xen_machphys_mfn_list xmml;
1007 l3_pgentry_t l3e;
1008 l2_pgentry_t l2e;
1009 unsigned long v;
1010 xen_pfn_t mfn, last_mfn;
1011 unsigned int i;
1012 long rc = 0;
1014 switch ( op )
1015 {
1016 case XENMEM_machphys_mfn_list:
1017 if ( copy_from_guest(&xmml, arg, 1) )
1018 return -EFAULT;
1020 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1021 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
1022 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
1023 (i != xmml.max_extents) &&
1024 (v < (unsigned long)(machine_to_phys_mapping + max_page));
1025 i++, v += 1UL << L2_PAGETABLE_SHIFT )
1026 {
1027 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
1028 l3_table_offset(v)];
1029 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1030 mfn = last_mfn;
1031 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
1032 {
1033 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
1034 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
1035 mfn = l2e_get_pfn(l2e);
1036 else
1037 mfn = last_mfn;
1038 }
1039 else
1040 {
1041 mfn = l3e_get_pfn(l3e)
1042 + (l2_table_offset(v) << PAGETABLE_ORDER);
1043 }
1044 ASSERT(mfn);
1045 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
1046 return -EFAULT;
1047 last_mfn = mfn;
1048 }
1050 xmml.nr_extents = i;
1051 if ( copy_to_guest(arg, &xmml, 1) )
1052 return -EFAULT;
1054 break;
1056 default:
1057 rc = -ENOSYS;
1058 break;
1059 }
1061 return rc;
1062 }
1064 long do_stack_switch(unsigned long ss, unsigned long esp)
1065 {
1066 fixup_guest_stack_selector(current->domain, ss);
1067 current->arch.guest_context.kernel_ss = ss;
1068 current->arch.guest_context.kernel_sp = esp;
1069 return 0;
1070 }
1072 long do_set_segment_base(unsigned int which, unsigned long base)
1073 {
1074 struct vcpu *v = current;
1075 long ret = 0;
1077 switch ( which )
1078 {
1079 case SEGBASE_FS:
1080 if ( wrmsr_safe(MSR_FS_BASE, base, base>>32) )
1081 ret = -EFAULT;
1082 else
1083 v->arch.guest_context.fs_base = base;
1084 break;
1086 case SEGBASE_GS_USER:
1087 if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base, base>>32) )
1088 ret = -EFAULT;
1089 else
1090 v->arch.guest_context.gs_base_user = base;
1091 break;
1093 case SEGBASE_GS_KERNEL:
1094 if ( wrmsr_safe(MSR_GS_BASE, base, base>>32) )
1095 ret = -EFAULT;
1096 else
1097 v->arch.guest_context.gs_base_kernel = base;
1098 break;
1100 case SEGBASE_GS_USER_SEL:
1101 __asm__ __volatile__ (
1102 " swapgs \n"
1103 "1: movl %k0,%%gs \n"
1104 " "safe_swapgs" \n"
1105 ".section .fixup,\"ax\" \n"
1106 "2: xorl %k0,%k0 \n"
1107 " jmp 1b \n"
1108 ".previous \n"
1109 ".section __ex_table,\"a\"\n"
1110 " .align 8 \n"
1111 " .quad 1b,2b \n"
1112 ".previous "
1113 : : "r" (base&0xffff) );
1114 break;
1116 default:
1117 ret = -EINVAL;
1118 break;
1119 }
1121 return ret;
1122 }
1125 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1126 int check_descriptor(const struct domain *dom, struct desc_struct *d)
1127 {
1128 u32 a = d->a, b = d->b;
1129 u16 cs;
1130 unsigned int dpl;
1132 /* A not-present descriptor will always fault, so is safe. */
1133 if ( !(b & _SEGMENT_P) )
1134 goto good;
1136 /* Check and fix up the DPL. */
1137 dpl = (b >> 13) & 3;
1138 __fixup_guest_selector(dom, dpl);
1139 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1141 /* All code and data segments are okay. No base/limit checking. */
1142 if ( (b & _SEGMENT_S) )
1143 {
1144 if ( is_pv_32bit_domain(dom) )
1145 {
1146 unsigned long base, limit;
1148 if ( b & _SEGMENT_L )
1149 goto bad;
1151 /*
1152 * Older PAE Linux guests use segments which are limited to
1153 * 0xf6800000. Extend these to allow access to the larger read-only
1154 * M2P table available in 32on64 mode.
1155 */
1156 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
1158 limit = (b & 0xf0000) | (a & 0xffff);
1159 limit++; /* We add one because limit is inclusive. */
1161 if ( (b & _SEGMENT_G) )
1162 limit <<= 12;
1164 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1165 {
1166 a |= 0x0000ffff;
1167 b |= 0x000f0000;
1168 }
1169 }
1171 goto good;
1172 }
1174 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1175 if ( (b & _SEGMENT_TYPE) == 0x000 )
1176 goto good;
1178 /* Everything but a call gate is discarded here. */
1179 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1180 goto bad;
1182 /* Validate the target code selector. */
1183 cs = a >> 16;
1184 if ( !guest_gate_selector_okay(dom, cs) )
1185 goto bad;
1186 /*
1187 * Force DPL to zero, causing a GP fault with its error code indicating
1188 * the gate in use, allowing emulation. This is necessary because with
1189 * native guests (kernel in ring 3) call gates cannot be used directly
1190 * to transition from user to kernel mode (and whether a gate is used
1191 * to enter the kernel can only be determined when the gate is being
1192 * used), and with compat guests call gates cannot be used at all as
1193 * there are only 64-bit ones.
1194 * Store the original DPL in the selector's RPL field.
1195 */
1196 b &= ~_SEGMENT_DPL;
1197 cs = (cs & ~3) | dpl;
1198 a = (a & 0xffffU) | (cs << 16);
1200 /* Reserved bits must be zero. */
1201 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1202 goto bad;
1204 good:
1205 d->a = a;
1206 d->b = b;
1207 return 1;
1208 bad:
1209 return 0;
1210 }
1212 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1213 {
1214 struct domain *d = current->domain;
1216 if (guest_mode(regs) &&
1217 is_pv_32bit_domain(d) &&
1218 ((addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1219 (addr < MACH2PHYS_COMPAT_VIRT_END)) )
1220 return 1;
1221 return 0;
1222 }
1224 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1225 {
1226 struct domain *d = current->domain;
1227 l4_pgentry_t *pl4e = NULL;
1228 l4_pgentry_t l4e;
1229 l3_pgentry_t *pl3e = NULL;
1230 l3_pgentry_t l3e;
1231 l2_pgentry_t *pl2e = NULL;
1232 l2_pgentry_t l2e, idle_l2e;
1233 unsigned long mfn, idle_index;
1234 int ret = 0;
1236 if (!is_pv_32on64_domain(d))
1237 return 0;
1239 if ((addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1240 (addr > MACH2PHYS_COMPAT_VIRT_END) )
1241 return 0;
1243 mfn = (read_cr3()) >> PAGE_SHIFT;
1245 pl4e = map_domain_page(mfn);
1247 l4e = pl4e[addr];
1249 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1250 goto unmap;
1252 mfn = l4e_get_pfn(l4e);
1253 /* We don't need to get the page type here since it is the current CR3 */
1254 pl3e = map_domain_page(mfn);
1256 l3e = pl3e[3];
1258 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1259 goto unmap;
1261 mfn = l3e_get_pfn(l3e);
1262 pl2e = map_domain_page(mfn);
1264 l2e = pl2e[l2_table_offset(addr)];
1266 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1267 goto unmap;
1269 idle_index = (l2_table_offset(addr) -
1270 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1271 sizeof(l2_pgentry_t);
1272 idle_l2e = compat_idle_pg_table_l2[idle_index];
1273 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1274 goto unmap;
1276 memcpy(&pl2e[l2_table_offset(addr)],
1277 &compat_idle_pg_table_l2[idle_index],
1278 sizeof(l2_pgentry_t));
1280 ret = EXCRET_fault_fixed;
1282 unmap:
1283 if ( pl4e )
1284 unmap_domain_page(pl4e);
1285 if ( pl3e )
1286 unmap_domain_page(pl3e);
1287 if ( pl2e )
1288 unmap_domain_page(pl2e);
1290 return ret;
1291 }
1293 void domain_set_alloc_bitsize(struct domain *d)
1294 {
1295 if ( !is_pv_32on64_domain(d) ||
1296 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1297 d->arch.physaddr_bitsize > 0 )
1298 return;
1299 d->arch.physaddr_bitsize =
1300 /* 2^n entries can be contained in guest's p2m mapping space */
1301 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1302 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1303 + PAGE_SHIFT;
1304 }
1306 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1307 {
1308 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1309 return bits;
1310 return min(d->arch.physaddr_bitsize, bits);
1311 }
1313 int transfer_pages_to_heap(struct mem_hotadd_info *info)
1314 {
1315 unsigned long i;
1316 struct page_info *pg;
1318 /*
1319 * Mark the allocated pages before putting the free pages into the buddy
1320 * allocator, to avoid merging in free_heap_pages
1321 */
1322 for (i = info->spfn; i < info->cur; i++)
1323 {
1324 pg = mfn_to_page(i);
1325 pg->count_info = PGC_state_inuse;
1326 }
1328 init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1330 return 0;
1331 }
1333 int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1334 {
1335 unsigned long s, e, length;
1337 if ( (spfn >= epfn) || (spfn < max_page) )
1338 return 0;
1340 if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1341 return 0;
1343 if ( (spfn | epfn) & pfn_hole_mask )
1344 return 0;
1346 /* Calculate the maximum m2p/compat m2p/frametable pages required */
1347 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1348 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1349 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1351 length = (e - s) * sizeof(unsigned long);
1353 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1354 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1355 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1357 e = min_t(unsigned long, e,
1358 (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1360 if ( e > s )
1361 length += (e -s) * sizeof(unsigned int);
1363 s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1364 e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1366 length += (e - s) * sizeof(struct page_info);
1368 if ((length >> PAGE_SHIFT) > (epfn - spfn))
1369 return 0;
1371 return 1;
1372 }
1374 /*
1375 * Be a bit paranoid about memory allocation failures, since they may
1376 * be the reason for the memory add in the first place
1377 */
1378 int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1379 {
1380 struct mem_hotadd_info info;
1381 int ret, node;
1382 unsigned long old_max = max_page, old_total = total_pages;
1383 unsigned long old_node_start, old_node_span, orig_online;
1384 unsigned long i;
1386 dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1388 if ( !mem_hotadd_check(spfn, epfn) )
1389 return -EINVAL;
1391 if ( (node = setup_node(pxm)) == -1 )
1392 return -EINVAL;
1394 if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1395 {
1396 dprintk(XENLOG_WARNING, "spfn %lx ~ epfn %lx pxm %x node %x"
1397 " is not numa valid\n", spfn, epfn, pxm, node);
1398 return -EINVAL;
1399 }
1401 ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
1402 epfn - spfn, PAGE_HYPERVISOR);
1403 if ( ret )
1404 return ret;
1406 old_node_start = NODE_DATA(node)->node_start_pfn;
1407 old_node_span = NODE_DATA(node)->node_spanned_pages;
1408 orig_online = node_online(node);
1410 if ( !orig_online )
1411 {
1412 dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm);
1413 NODE_DATA(node)->node_id = node;
1414 NODE_DATA(node)->node_start_pfn = spfn;
1415 NODE_DATA(node)->node_spanned_pages =
1416 epfn - node_start_pfn(node);
1417 node_set_online(node);
1418 }else
1419 {
1420 if (NODE_DATA(node)->node_start_pfn > spfn)
1421 NODE_DATA(node)->node_start_pfn = spfn;
1422 if (node_end_pfn(node) < epfn)
1423 NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1424 }
1426 ret = -EINVAL;
1427 info.spfn = spfn;
1428 info.epfn = epfn;
1429 info.cur = spfn;
1431 ret = extend_frame_table(&info);
1432 if (ret)
1433 goto destroy_frametable;
1435 /* Set max_page, as setup_m2p_table will use it */
1436 max_page = epfn;
1437 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1438 total_pages += epfn - spfn;
1440 set_pdx_range(spfn, epfn);
1441 ret = setup_m2p_table(&info);
1443 if ( ret )
1444 goto destroy_m2p;
1446 for ( i = old_max; i < epfn; i++ )
1447 if ( iommu_map_page(dom0, i, i) )
1448 break;
1450 if ( i != epfn )
1451 goto destroy_iommu;
1453 /* We can't revert any more */
1454 transfer_pages_to_heap(&info);
1456 share_hotadd_m2p_table(&info);
1458 return 0;
1460 destroy_iommu:
1461 while (i-- > old_max)
1462 iommu_unmap_page(dom0, i);
1464 destroy_m2p:
1465 destroy_m2p_mapping(&info);
1466 max_page = old_max;
1467 total_pages = old_total;
1468 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1469 destroy_frametable:
1470 cleanup_frame_table(&info);
1471 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1472 (unsigned long)mfn_to_virt(epfn));
1474 if ( !orig_online )
1475 node_set_offline(node);
1476 NODE_DATA(node)->node_start_pfn = old_node_start;
1477 NODE_DATA(node)->node_spanned_pages = old_node_span;
1479 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1480 (unsigned long)mfn_to_virt(epfn));
1481 return ret;
1482 }
1484 #include "compat/mm.c"
1486 /*
1487 * Local variables:
1488 * mode: C
1489 * c-set-style: "BSD"
1490 * c-basic-offset: 4
1491 * tab-width: 4
1492 * indent-tabs-mode: nil
1493 * End:
1494 */