debuggers.hg

view xen/include/asm-x86/paging.h @ 19826:2f9e1348aa98

x86_64: allow more vCPU-s per guest

Since the shared info layout is fixed, guests are required to use
VCPUOP_register_vcpu_info prior to booting any vCPU beyond the
traditional limit of 32.
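
For illustration only (this snippet is not part of the changeset, and the
guest-side placement per_cpu_vcpu_info is a hypothetical example), a guest
kernel registers a vcpu_info area for such a vCPU roughly as follows before
bringing it online:

    struct vcpu_register_vcpu_info info = {
        .mfn    = virt_to_mfn(&per_cpu_vcpu_info[cpu]),   /* hypothetical placement */
        .offset = offset_in_page(&per_cpu_vcpu_info[cpu]),
    };
    /* Must be done before the vCPU is brought up when cpu >= 32. */
    if ( HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info) )
        panic("vcpu_info registration failed for CPU %d", cpu);

Once registered, the mapping stays in place for the lifetime of the guest.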

MAX_VIRT_CPUS, being an implementation detail of the hypervisor, is no
longer exposed in the public headers.

The tools changes are clearly incomplete (and done only so things would
build again), and the current state of the tools (using scalar
variables all over the place to represent vCPU bitmaps) very likely
doesn't permit booting DomU-s with more than the traditional number of
vCPU-s. Testing of the extended functionality was done with Dom0 (96
vCPU-s, as well as 128 vCPU-s out of which the kernel elected - by way
of a simple kernel side patch - to use only some, resulting in a sparse
bitmap).

The ia64 changes are only there to make things build, and are
build-tested only (the tools part only as far as the build would go
before hitting unrelated problems in the blktap code).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:14:16 2009 +0100 (2009-06-18)
parents 43833a6d50a5
children 809b20f066fb
line source

/******************************************************************************
 * include/asm-x86/paging.h
 *
 * Common interface for paging support
 * Copyright (c) 2007 Advanced Micro Devices (Wei Huang)
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef _XEN_PAGING_H
#define _XEN_PAGING_H

#include <xen/mm.h>
#include <public/domctl.h>
#include <xen/sched.h>
#include <xen/perfc.h>
#include <xen/domain_page.h>
#include <asm/flushtlb.h>
#include <asm/domain.h>

/*****************************************************************************
 * Macros to tell which paging mode a domain is in */

#define PG_SH_shift    20
#define PG_HAP_shift   21
/* We're in one of the shadow modes */
#define PG_SH_enable   (1U << PG_SH_shift)
#define PG_HAP_enable  (1U << PG_HAP_shift)

/* common paging mode bits */
#define PG_mode_shift  10
/* Refcounts based on shadow tables instead of guest tables */
#define PG_refcounts   (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << PG_mode_shift)
/* Enable log dirty mode */
#define PG_log_dirty   (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << PG_mode_shift)
/* Xen does p2m translation, not guest */
#define PG_translate   (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << PG_mode_shift)
/* Xen does not steal address space from the domain for its own bookkeeping;
 * requires VT or similar mechanisms */
#define PG_external   (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << PG_mode_shift)

#define paging_mode_enabled(_d)   ((_d)->arch.paging.mode)
#define paging_mode_shadow(_d)    ((_d)->arch.paging.mode & PG_SH_enable)
#define paging_mode_hap(_d)       ((_d)->arch.paging.mode & PG_HAP_enable)

#define paging_mode_refcounts(_d) ((_d)->arch.paging.mode & PG_refcounts)
#define paging_mode_log_dirty(_d) ((_d)->arch.paging.mode & PG_log_dirty)
#define paging_mode_translate(_d) ((_d)->arch.paging.mode & PG_translate)
#define paging_mode_external(_d)  ((_d)->arch.paging.mode & PG_external)
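
/*
 * Usage sketch (illustrative, not part of the original header):
 *
 *     if ( paging_mode_enabled(d) && paging_mode_log_dirty(d) )
 *         paging_mark_dirty(d, mfn_x(gmfn));
 *
 * A HAP domain also has PG_translate and PG_external set, so
 * paging_mode_translate(d) is the right test for "Xen maintains the p2m
 * for this domain", regardless of whether shadow or HAP provides it.
 */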

/* flags used for paging debug */
#define PAGING_DEBUG_LOGDIRTY 0

/*****************************************************************************
 * Mode-specific entry points into the shadow code.
 *
 * These shouldn't be used directly by callers; rather use the functions
 * below which will indirect through this table as appropriate. */

struct sh_emulate_ctxt;
struct shadow_paging_mode {
    void          (*detach_old_tables     )(struct vcpu *v);
    int           (*x86_emulate_write     )(struct vcpu *v, unsigned long va,
                                            void *src, u32 bytes,
                                            struct sh_emulate_ctxt *sh_ctxt);
    int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
                                            unsigned long old,
                                            unsigned long new,
                                            unsigned int bytes,
                                            struct sh_emulate_ctxt *sh_ctxt);
#ifdef __i386__
    int           (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
                                            unsigned long old_lo,
                                            unsigned long old_hi,
                                            unsigned long new_lo,
                                            unsigned long new_hi,
                                            struct sh_emulate_ctxt *sh_ctxt);
#endif
    mfn_t         (*make_monitor_table    )(struct vcpu *v);
    void          (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
    int           (*guess_wrmap           )(struct vcpu *v,
                                            unsigned long vaddr, mfn_t gmfn);
    /* For outsiders to tell what mode we're in */
    unsigned int shadow_levels;
};

/************************************************/
/*          common paging interface             */
/************************************************/
struct paging_mode {
    int           (*page_fault            )(struct vcpu *v, unsigned long va,
                                            struct cpu_user_regs *regs);
    int           (*invlpg                )(struct vcpu *v, unsigned long va);
    unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va,
                                            uint32_t *pfec);
    void          (*update_cr3            )(struct vcpu *v, int do_locking);
    void          (*update_paging_modes   )(struct vcpu *v);
    void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
                                            l1_pgentry_t *p, mfn_t table_mfn,
                                            l1_pgentry_t new,
                                            unsigned int level);
    int           (*write_guest_entry     )(struct vcpu *v, intpte_t *p,
                                            intpte_t new, mfn_t gmfn);
    int           (*cmpxchg_guest_entry   )(struct vcpu *v, intpte_t *p,
                                            intpte_t *old, intpte_t new,
                                            mfn_t gmfn);
    void *        (*guest_map_l1e         )(struct vcpu *v, unsigned long va,
                                            unsigned long *gl1mfn);
    void          (*guest_get_eff_l1e     )(struct vcpu *v, unsigned long va,
                                            void *eff_l1e);
    unsigned int guest_levels;

    /* paging support extension */
    struct shadow_paging_mode shadow;
};

/*****************************************************************************
 * Log dirty code */

/* allocate log dirty bitmap resource for recording dirty pages */
int paging_alloc_log_dirty_bitmap(struct domain *d);

/* free log dirty bitmap resource */
void paging_free_log_dirty_bitmap(struct domain *d);

/* get the dirty bitmap for a specific range of pfns */
int paging_log_dirty_range(struct domain *d,
                           unsigned long begin_pfn,
                           unsigned long nr,
                           XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);

/* enable log dirty */
int paging_log_dirty_enable(struct domain *d);

/* disable log dirty */
int paging_log_dirty_disable(struct domain *d);

/* log dirty initialization */
void paging_log_dirty_init(struct domain *d,
                           int  (*enable_log_dirty)(struct domain *d),
                           int  (*disable_log_dirty)(struct domain *d),
                           void (*clean_dirty_bitmap)(struct domain *d));
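
/*
 * Illustrative sketch (not part of the original header): each paging
 * backend registers its own log-dirty handlers at domain-init time,
 * roughly as the HAP code of this era does:
 *
 *     paging_log_dirty_init(d, hap_enable_log_dirty,
 *                              hap_disable_log_dirty,
 *                              hap_clean_dirty_bitmap);
 *
 * (the shadow code registers its own equivalents).
 * paging_log_dirty_enable()/paging_log_dirty_disable() then indirect
 * through these hooks.
 */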

/* mark a page as dirty */
void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);

/*
 * Log-dirty radix tree indexing:
 *   All tree nodes are PAGE_SIZE bytes, mapped on-demand.
 *   Leaf nodes are simple bitmaps; 1 bit per guest pfn.
 *   Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns.
 * TODO: Dynamic radix tree height. Most guests will only need 2 levels.
 *       The fourth level is basically unusable on 32-bit Xen.
 * TODO2: Abstract out the radix-tree mechanics?
 */
#define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER)
#define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1))
#define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \
                              (LOGDIRTY_NODE_ENTRIES-1))
#define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \
                              (LOGDIRTY_NODE_ENTRIES-1))
#if BITS_PER_LONG == 64
#define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \
                              (LOGDIRTY_NODE_ENTRIES-1))
#else
#define L4_LOGDIRTY_IDX(pfn) 0
#endif
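
/*
 * Worked example (illustrative, not part of the original header), assuming
 * PAGE_SHIFT == 12 and PAGETABLE_ORDER == 9: each leaf page holds
 * PAGE_SIZE * 8 == 32768 dirty bits, so for pfn == 0x123456:
 *
 *     L1_LOGDIRTY_IDX(pfn) == pfn & 0x7fff         == 0x3456  (bit in leaf)
 *     L2_LOGDIRTY_IDX(pfn) == (pfn >> 15) & 0x1ff  == 0x24    (leaf slot)
 *     L3_LOGDIRTY_IDX(pfn) == (pfn >> 24) & 0x1ff  == 0x0     (L2 slot)
 *     L4_LOGDIRTY_IDX(pfn) == (pfn >> 33) & 0x1ff  == 0x0     (64-bit only)
 */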

/* VRAM dirty tracking support */
struct sh_dirty_vram {
    unsigned long begin_pfn;
    unsigned long end_pfn;
    paddr_t *sl1ma;
    uint8_t *dirty_bitmap;
    s_time_t last_dirty;
};

/*****************************************************************************
 * Entry points into the paging-assistance code */

/* Initialize the paging resource for vcpu struct. It is called by
 * vcpu_initialise() in domain.c */
void paging_vcpu_init(struct vcpu *v);

/* Set up the paging-assistance-specific parts of a domain struct at
 * start of day.  Called for every domain from arch_domain_create() */
int paging_domain_init(struct domain *d);

/* Handler for paging-control ops: operations from user-space to enable
 * and disable ephemeral shadow modes (test mode and log-dirty mode) and
 * manipulate the log-dirty bitmap. */
int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
                  XEN_GUEST_HANDLE(void) u_domctl);

/* Call when destroying a domain */
void paging_teardown(struct domain *d);

/* Call once all of the references to the domain have gone away */
void paging_final_teardown(struct domain *d);

/* Enable an arbitrary paging-assistance mode.  Call once at domain
 * creation. */
int paging_enable(struct domain *d, u32 mode);

/* Page fault handler
 * Called from pagefault handler in Xen, and from the HVM trap handlers
 * for pagefaults.  Returns 1 if this fault was an artefact of the
 * paging code (and the guest should retry) or 0 if it is not (and the
 * fault should be handled elsewhere or passed to the guest).
 *
 * Note: under shadow paging, this function handles all page faults;
 * however, for hardware-assisted paging, this function handles only
 * host page faults (i.e. nested page faults). */
static inline int
paging_fault(unsigned long va, struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    return v->arch.paging.mode->page_fault(v, va, regs);
}
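
/*
 * Illustrative caller sketch (not part of the original header); the real
 * fixup paths live in traps.c and the HVM exit handlers, this is only
 * schematic:
 *
 *     if ( paging_mode_enabled(v->domain) && paging_fault(addr, regs) )
 *         return EXCRET_fault_fixed;   (the fault was ours; guest retries)
 *
 * A zero return means the fault was not caused by the paging code and
 * must be handled elsewhere or forwarded to the guest.
 */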

/* Handle invlpg requests on vcpus.
 * Returns 1 if the invlpg instruction should be issued on the hardware,
 * or 0 if it's safe not to do so. */
static inline int paging_invlpg(struct vcpu *v, unsigned long va)
{
    return v->arch.paging.mode->invlpg(v, va);
}

/* Translate a guest virtual address to the frame number that the
 * *guest* pagetables would map it to.  Returns INVALID_GFN if the guest
 * tables don't map this address for this kind of access.
 * pfec[0] is used to determine which kind of access this is when
 * walking the tables.  The caller should set the PFEC_page_present bit
 * in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
#define INVALID_GFN (-1UL)
static inline unsigned long paging_gva_to_gfn(struct vcpu *v,
                                              unsigned long va,
                                              uint32_t *pfec)
{
    return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
}
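
/*
 * Illustrative sketch (not part of the original header): a caller probing
 * whether a guest linear address is writable might do roughly
 *
 *     uint32_t pfec = PFEC_page_present | PFEC_write_access;
 *     unsigned long gfn = paging_gva_to_gfn(v, va, &pfec);
 *     if ( gfn == INVALID_GFN )
 *         ...   (inject #PF to the guest, using the updated pfec value
 *                as the error code)
 *
 * The PFEC_* constants are the x86 page-fault error-code bits from the
 * processor headers.
 */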

/* Update all the things that are derived from the guest's CR3.
 * Called when the guest changes CR3; the caller can then use v->arch.cr3
 * as the value to load into the host CR3 to schedule this vcpu */
static inline void paging_update_cr3(struct vcpu *v)
{
    v->arch.paging.mode->update_cr3(v, 1);
}

/* Update all the things that are derived from the guest's CR0/CR3/CR4.
 * Called to initialize paging structures if the paging mode
 * has changed, and when bringing up a VCPU for the first time. */
static inline void paging_update_paging_modes(struct vcpu *v)
{
    v->arch.paging.mode->update_paging_modes(v);
}

/* Write a new value into the guest pagetable, and update the
 * paging-assistance state appropriately.  Returns 0 if we page-faulted,
 * 1 for success. */
static inline int paging_write_guest_entry(struct vcpu *v, intpte_t *p,
                                           intpte_t new, mfn_t gmfn)
{
    if ( unlikely(paging_mode_enabled(v->domain)
                  && v->arch.paging.mode != NULL) )
        return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
    else
        return (!__copy_to_user(p, &new, sizeof(new)));
}

/* Cmpxchg a new value into the guest pagetable, and update the
 * paging-assistance state appropriately.  Returns 0 if we page-faulted,
 * 1 if not.  N.B. caller should check the value of "old" to see if the
 * cmpxchg itself was successful. */
static inline int paging_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                             intpte_t *old, intpte_t new,
                                             mfn_t gmfn)
{
    if ( unlikely(paging_mode_enabled(v->domain)
                  && v->arch.paging.mode != NULL) )
        return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
    else
        return (!cmpxchg_user(p, *old, new));
}

/* Helper function that writes a pte in such a way that a concurrent read
 * never sees a half-written entry that has _PAGE_PRESENT set */
static inline void safe_write_pte(l1_pgentry_t *p, l1_pgentry_t new)
{
#if CONFIG_PAGING_LEVELS == 3
    /* PAE machines write 64bit PTEs as two 32bit writes. */
    volatile unsigned long *d = (unsigned long *) p;
    unsigned long *s = (unsigned long *) &new;
    BUILD_BUG_ON(sizeof (l1_pgentry_t) != 2 * sizeof (unsigned long));
    d[0] = 0;
    d[1] = s[1];
    d[0] = s[0];
#else
    *p = new;
#endif
}

/* Atomically write a P2M entry and update the paging-assistance state
 * appropriately.
 * Arguments: the domain in question, the GFN whose mapping is being updated,
 * a pointer to the entry to be written, the MFN in which the entry resides,
 * the new contents of the entry, and the level in the p2m tree at which
 * we are writing. */
static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn,
                                          l1_pgentry_t *p, mfn_t table_mfn,
                                          l1_pgentry_t new, unsigned int level)
{
    struct vcpu *v = current;
    if ( v->domain != d )
        v = d->vcpu ? d->vcpu[0] : NULL;
    if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
    {
        return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
                                                    new, level);
    }
    else
        safe_write_pte(p, new);
}

/* Print paging-assistance info to the console */
void paging_dump_domain_info(struct domain *d);
void paging_dump_vcpu_info(struct vcpu *v);

/*****************************************************************************
 * Access to the guest pagetables */

/* Get a mapping of a PV guest's l1e for this virtual address. */
static inline l1_pgentry_t *
guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn)
{
    l2_pgentry_t l2e;

    if ( unlikely(paging_mode_translate(v->domain)) )
        return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);

    /* Find this l1e and its enclosing l1mfn in the linear map */
    if ( __copy_from_user(&l2e,
                          &__linear_l2_table[l2_linear_offset(addr)],
                          sizeof(l2_pgentry_t)) != 0 )
        return NULL;
    /* Check flags that it will be safe to read the l1e */
    if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE))
         != _PAGE_PRESENT )
        return NULL;
    *gl1mfn = l2e_get_pfn(l2e);
    return (l1_pgentry_t *)map_domain_page(*gl1mfn) + l1_table_offset(addr);
}

/* Pull down the mapping we got from guest_map_l1e() */
static inline void
guest_unmap_l1e(struct vcpu *v, void *p)
{
    unmap_domain_page(p);
}
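
/*
 * Illustrative usage sketch (not part of the original header): the map and
 * unmap calls above bracket a short critical section, e.g. in the PV
 * pagetable emulation paths:
 *
 *     unsigned long gl1mfn;
 *     l1_pgentry_t *pl1e = guest_map_l1e(v, addr, &gl1mfn);
 *     if ( pl1e != NULL )
 *     {
 *         ... read or update the entry, e.g. via paging_write_guest_entry()
 *         guest_unmap_l1e(v, pl1e);
 *     }
 */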

/* Read the guest's l1e that maps this address. */
static inline void
guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
{
    if ( likely(!paging_mode_translate(v->domain)) )
    {
        ASSERT(!paging_mode_external(v->domain));
        if ( __copy_from_user(eff_l1e,
                              &__linear_l1_table[l1_linear_offset(addr)],
                              sizeof(l1_pgentry_t)) != 0 )
            *(l1_pgentry_t *)eff_l1e = l1e_empty();
        return;
    }

    v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
}

/* Read the guest's l1e that maps this address, from the kernel-mode
 * pagetables. */
static inline void
guest_get_eff_kern_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
{
#if defined(__x86_64__)
    int user_mode = !(v->arch.flags & TF_kernel_mode);
#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
#else
#define TOGGLE_MODE() ((void)0)
#endif

    TOGGLE_MODE();
    guest_get_eff_l1e(v, addr, eff_l1e);
    TOGGLE_MODE();
}

#endif /* XEN_PAGING_H */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */