/root/src/xen/xen/arch/x86/mm/shadow/common.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * arch/x86/mm/shadow/common.c |
3 | | * |
4 | | * Shadow code that does not need to be multiply compiled. |
5 | | * Parts of this code are Copyright (c) 2006 by XenSource Inc. |
6 | | * Parts of this code are Copyright (c) 2006 by Michael A Fetterman |
7 | | * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. |
8 | | * |
9 | | * This program is free software; you can redistribute it and/or modify |
10 | | * it under the terms of the GNU General Public License as published by |
11 | | * the Free Software Foundation; either version 2 of the License, or |
12 | | * (at your option) any later version. |
13 | | * |
14 | | * This program is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | | * GNU General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU General Public License |
20 | | * along with this program; If not, see <http://www.gnu.org/licenses/>. |
21 | | */ |
22 | | |
23 | | #include <xen/types.h> |
24 | | #include <xen/mm.h> |
25 | | #include <xen/trace.h> |
26 | | #include <xen/sched.h> |
27 | | #include <xen/perfc.h> |
28 | | #include <xen/irq.h> |
29 | | #include <xen/domain_page.h> |
30 | | #include <xen/guest_access.h> |
31 | | #include <xen/keyhandler.h> |
32 | | #include <asm/event.h> |
33 | | #include <asm/page.h> |
34 | | #include <asm/current.h> |
35 | | #include <asm/flushtlb.h> |
36 | | #include <asm/shadow.h> |
37 | | #include <asm/hvm/ioreq.h> |
38 | | #include <xen/numa.h> |
39 | | #include "private.h" |
40 | | |
41 | | DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); |
42 | | |
43 | | static int sh_enable_log_dirty(struct domain *, bool log_global); |
44 | | static int sh_disable_log_dirty(struct domain *); |
45 | | static void sh_clean_dirty_bitmap(struct domain *); |
46 | | |
47 | | /* Set up the shadow-specific parts of a domain struct at start of day. |
48 | | * Called for every domain from arch_domain_create() */ |
49 | | int shadow_domain_init(struct domain *d, unsigned int domcr_flags) |
50 | 0 | { |
51 | 0 | static const struct log_dirty_ops sh_ops = { |
52 | 0 | .enable = sh_enable_log_dirty, |
53 | 0 | .disable = sh_disable_log_dirty, |
54 | 0 | .clean = sh_clean_dirty_bitmap, |
55 | 0 | }; |
56 | 0 |
57 | 0 | INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelist); |
58 | 0 | INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows); |
59 | 0 |
60 | 0 | /* Use shadow pagetables for log-dirty support */ |
61 | 0 | paging_log_dirty_init(d, &sh_ops); |
62 | 0 |
63 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
64 | 0 | d->arch.paging.shadow.oos_active = 0; |
65 | 0 | d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ? 1 : 0; |
66 | 0 | #endif |
67 | 0 | d->arch.paging.shadow.pagetable_dying_op = 0; |
68 | 0 |
69 | 0 | return 0; |
70 | 0 | } |
71 | | |
72 | | /* Set up the shadow-specific parts of a vcpu struct. Note: the most |
73 | | * important job is to initialize the update_paging_modes() function |
74 | | * pointer, which is used to initialize the rest of the resources. It |
75 | | * therefore does not matter which mode v->arch.paging.mode initially |
76 | | * points to, as long as it compiles. |
77 | | */ |
78 | | void shadow_vcpu_init(struct vcpu *v) |
79 | 0 | { |
80 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
81 | 0 | int i, j; |
82 | 0 |
83 | 0 | for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) |
84 | 0 | { |
85 | 0 | v->arch.paging.shadow.oos[i] = INVALID_MFN; |
86 | 0 | v->arch.paging.shadow.oos_snapshot[i] = INVALID_MFN; |
87 | 0 | for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ ) |
88 | 0 | v->arch.paging.shadow.oos_fixup[i].smfn[j] = INVALID_MFN; |
89 | 0 | } |
90 | 0 | #endif |
91 | 0 |
|
92 | 0 | v->arch.paging.mode = is_pv_vcpu(v) ? |
93 | 0 | &SHADOW_INTERNAL_NAME(sh_paging_mode, 4) : |
94 | 0 | &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); |
95 | 0 | } |
96 | | |
97 | | #if SHADOW_AUDIT |
98 | | int shadow_audit_enable = 0; |
99 | | |
100 | | static void shadow_audit_key(unsigned char key) |
101 | 0 | { |
102 | 0 | shadow_audit_enable = !shadow_audit_enable; |
103 | 0 | printk("%s shadow_audit_enable=%d\n", |
104 | 0 | __func__, shadow_audit_enable); |
105 | 0 | } |
106 | | |
107 | | static int __init shadow_audit_key_init(void) |
108 | 1 | { |
109 | 1 | register_keyhandler('O', shadow_audit_key, "toggle shadow audits", 0); |
110 | 1 | return 0; |
111 | 1 | } |
112 | | __initcall(shadow_audit_key_init); |
113 | | #endif /* SHADOW_AUDIT */ |
114 | | |
115 | | |
116 | | /**************************************************************************/ |
117 | | /* x86 emulator support for the shadow code |
118 | | */ |
119 | | |
120 | | /* |
121 | | * Callers which pass a known in-range x86_segment can rely on the return |
122 | | * pointer being valid. Other callers must explicitly check for errors. |
123 | | */ |
124 | | static struct segment_register *hvm_get_seg_reg( |
125 | | enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt) |
126 | 0 | { |
127 | 0 | unsigned int idx = seg; |
128 | 0 | struct segment_register *seg_reg; |
129 | 0 |
130 | 0 | if ( idx >= ARRAY_SIZE(sh_ctxt->seg_reg) ) |
131 | 0 | return ERR_PTR(-X86EMUL_UNHANDLEABLE); |
132 | 0 |
133 | 0 | seg_reg = &sh_ctxt->seg_reg[idx]; |
134 | 0 | if ( !__test_and_set_bit(idx, &sh_ctxt->valid_seg_regs) ) |
135 | 0 | hvm_get_segment_register(current, idx, seg_reg); |
136 | 0 | return seg_reg; |
137 | 0 | } |
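hvm_get_seg_reg() reports failure through its return value using the kernel-style ERR_PTR idiom: a small negative error code is cast to a pointer into the never-mappable top page of the address space, so callers can tell it apart from any real pointer. A minimal, self-contained model of the idiom (not the Xen definitions, which live in a shared header):

    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Encode a negative error code as an invalid pointer value. */
    static inline void *ERR_PTR(long error) { return (void *)error; }

    /* True only for pointers in the top MAX_ERRNO bytes of the space. */
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* Decode the error code again. */
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

    int main(void)
    {
        void *p = ERR_PTR(-22);
        if ( IS_ERR(p) )
            printf("error %ld\n", -PTR_ERR(p));   /* prints "error 22" */
        return 0;
    }

This is also why hvm_translate_virtual_addr() below can say "return -PTR_ERR(reg);": the stored code is negative, so negating it recovers the positive X86EMUL_UNHANDLEABLE value.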
138 | | |
139 | | static int hvm_translate_virtual_addr( |
140 | | enum x86_segment seg, |
141 | | unsigned long offset, |
142 | | unsigned int bytes, |
143 | | enum hvm_access_type access_type, |
144 | | struct sh_emulate_ctxt *sh_ctxt, |
145 | | unsigned long *linear) |
146 | 0 | { |
147 | 0 | const struct segment_register *reg; |
148 | 0 | int okay; |
149 | 0 |
150 | 0 | reg = hvm_get_seg_reg(seg, sh_ctxt); |
151 | 0 | if ( IS_ERR(reg) ) |
152 | 0 | return -PTR_ERR(reg); |
153 | 0 |
154 | 0 | okay = hvm_virtual_to_linear_addr( |
155 | 0 | seg, reg, offset, bytes, access_type, |
156 | 0 | hvm_get_seg_reg(x86_seg_cs, sh_ctxt), linear); |
157 | 0 |
158 | 0 | if ( !okay ) |
159 | 0 | { |
160 | 0 | /* |
161 | 0 | * Leave exception injection to the caller for non-user segments: We |
162 | 0 | * neither know the exact error code to be used, nor can we easily |
163 | 0 | * determine the kind of exception (#GP or #TS) in that case. |
164 | 0 | */ |
165 | 0 | if ( is_x86_user_segment(seg) ) |
166 | 0 | x86_emul_hw_exception( |
167 | 0 | (seg == x86_seg_ss) ? TRAP_stack_error : TRAP_gp_fault, |
168 | 0 | 0, &sh_ctxt->ctxt); |
169 | 0 | return X86EMUL_EXCEPTION; |
170 | 0 | } |
171 | 0 |
172 | 0 | return 0; |
173 | 0 | } |
174 | | |
175 | | static int |
176 | | hvm_read(enum x86_segment seg, |
177 | | unsigned long offset, |
178 | | void *p_data, |
179 | | unsigned int bytes, |
180 | | enum hvm_access_type access_type, |
181 | | struct sh_emulate_ctxt *sh_ctxt) |
182 | 0 | { |
183 | 0 | pagefault_info_t pfinfo; |
184 | 0 | unsigned long addr; |
185 | 0 | int rc; |
186 | 0 |
187 | 0 | rc = hvm_translate_virtual_addr( |
188 | 0 | seg, offset, bytes, access_type, sh_ctxt, &addr); |
189 | 0 | if ( rc || !bytes ) |
190 | 0 | return rc; |
191 | 0 |
192 | 0 | if ( access_type == hvm_access_insn_fetch ) |
193 | 0 | rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); |
194 | 0 | else |
195 | 0 | rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo); |
196 | 0 |
197 | 0 | switch ( rc ) |
198 | 0 | { |
199 | 0 | case HVMTRANS_okay: |
200 | 0 | return X86EMUL_OKAY; |
201 | 0 | case HVMTRANS_bad_linear_to_gfn: |
202 | 0 | x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &sh_ctxt->ctxt); |
203 | 0 | return X86EMUL_EXCEPTION; |
204 | 0 | case HVMTRANS_bad_gfn_to_mfn: |
205 | 0 | case HVMTRANS_unhandleable: |
206 | 0 | return X86EMUL_UNHANDLEABLE; |
207 | 0 | case HVMTRANS_gfn_paged_out: |
208 | 0 | case HVMTRANS_gfn_shared: |
209 | 0 | return X86EMUL_RETRY; |
210 | 0 | } |
211 | 0 |
212 | 0 | BUG(); |
213 | 0 | return X86EMUL_UNHANDLEABLE; |
214 | 0 | } |
215 | | |
216 | | static int |
217 | | hvm_emulate_read(enum x86_segment seg, |
218 | | unsigned long offset, |
219 | | void *p_data, |
220 | | unsigned int bytes, |
221 | | struct x86_emulate_ctxt *ctxt) |
222 | 0 | { |
223 | 0 | if ( !is_x86_user_segment(seg) ) |
224 | 0 | return X86EMUL_UNHANDLEABLE; |
225 | 0 | return hvm_read(seg, offset, p_data, bytes, hvm_access_read, |
226 | 0 | container_of(ctxt, struct sh_emulate_ctxt, ctxt)); |
227 | 0 | } |
228 | | |
229 | | static int |
230 | | hvm_emulate_insn_fetch(enum x86_segment seg, |
231 | | unsigned long offset, |
232 | | void *p_data, |
233 | | unsigned int bytes, |
234 | | struct x86_emulate_ctxt *ctxt) |
235 | 0 | { |
236 | 0 | struct sh_emulate_ctxt *sh_ctxt = |
237 | 0 | container_of(ctxt, struct sh_emulate_ctxt, ctxt); |
238 | 0 | unsigned int insn_off = offset - sh_ctxt->insn_buf_eip; |
239 | 0 |
240 | 0 | ASSERT(seg == x86_seg_cs); |
241 | 0 |
242 | 0 | /* Fall back if requested bytes are not in the prefetch cache. */ |
243 | 0 | if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) |
244 | 0 | return hvm_read(seg, offset, p_data, bytes, |
245 | 0 | hvm_access_insn_fetch, sh_ctxt); |
246 | 0 |
247 | 0 | /* Hit the cache. Simple memcpy. */ |
248 | 0 | memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); |
249 | 0 | return X86EMUL_OKAY; |
250 | 0 | } |
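The cache-hit test above leans on unsigned arithmetic: insn_off is unsigned, so an offset below insn_buf_eip wraps to a huge value and the comparison sends the fetch down the fallback path. For comparison, a sketch of the same window test written in an explicitly overflow-safe form (an illustration, not Xen code):

    #include <stdbool.h>

    /* Is [pos, pos+len) inside the cached window [start, start+have)? */
    static bool in_window(unsigned long pos, unsigned long start,
                          unsigned int have, unsigned int len)
    {
        unsigned long off = pos - start;   /* wraps hugely if pos < start */

        /* "off <= have && len <= have - off" cannot overflow, unlike a
         * bare "off + len > have" test when off is near the type's max. */
        return off <= have && len <= have - off;
    }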
251 | | |
252 | | static int |
253 | | hvm_emulate_write(enum x86_segment seg, |
254 | | unsigned long offset, |
255 | | void *p_data, |
256 | | unsigned int bytes, |
257 | | struct x86_emulate_ctxt *ctxt) |
258 | 0 | { |
259 | 0 | struct sh_emulate_ctxt *sh_ctxt = |
260 | 0 | container_of(ctxt, struct sh_emulate_ctxt, ctxt); |
261 | 0 | struct vcpu *v = current; |
262 | 0 | unsigned long addr; |
263 | 0 | int rc; |
264 | 0 |
265 | 0 | /* How many emulations could we save if we unshadowed on stack writes? */ |
266 | 0 | if ( seg == x86_seg_ss ) |
267 | 0 | perfc_incr(shadow_fault_emulate_stack); |
268 | 0 |
269 | 0 | rc = hvm_translate_virtual_addr( |
270 | 0 | seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); |
271 | 0 | if ( rc || !bytes ) |
272 | 0 | return rc; |
273 | 0 |
274 | 0 | return v->arch.paging.mode->shadow.x86_emulate_write( |
275 | 0 | v, addr, p_data, bytes, sh_ctxt); |
276 | 0 | } |
277 | | |
278 | | static int |
279 | | hvm_emulate_cmpxchg(enum x86_segment seg, |
280 | | unsigned long offset, |
281 | | void *p_old, |
282 | | void *p_new, |
283 | | unsigned int bytes, |
284 | | struct x86_emulate_ctxt *ctxt) |
285 | 0 | { |
286 | 0 | struct sh_emulate_ctxt *sh_ctxt = |
287 | 0 | container_of(ctxt, struct sh_emulate_ctxt, ctxt); |
288 | 0 | struct vcpu *v = current; |
289 | 0 | unsigned long addr, old, new; |
290 | 0 | int rc; |
291 | 0 |
292 | 0 | if ( bytes > sizeof(long) ) |
293 | 0 | return X86EMUL_UNHANDLEABLE; |
294 | 0 |
295 | 0 | rc = hvm_translate_virtual_addr( |
296 | 0 | seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); |
297 | 0 | if ( rc ) |
298 | 0 | return rc; |
299 | 0 |
300 | 0 | old = new = 0; |
301 | 0 | memcpy(&old, p_old, bytes); |
302 | 0 | memcpy(&new, p_new, bytes); |
303 | 0 |
304 | 0 | return v->arch.paging.mode->shadow.x86_emulate_cmpxchg( |
305 | 0 | v, addr, old, new, bytes, sh_ctxt); |
306 | 0 | } |
307 | | |
308 | | static const struct x86_emulate_ops hvm_shadow_emulator_ops = { |
309 | | .read = hvm_emulate_read, |
310 | | .insn_fetch = hvm_emulate_insn_fetch, |
311 | | .write = hvm_emulate_write, |
312 | | .cmpxchg = hvm_emulate_cmpxchg, |
313 | | .cpuid = hvmemul_cpuid, |
314 | | }; |
315 | | |
316 | | const struct x86_emulate_ops *shadow_init_emulation( |
317 | | struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs) |
318 | 0 | { |
319 | 0 | struct segment_register *creg, *sreg; |
320 | 0 | struct vcpu *v = current; |
321 | 0 | unsigned long addr; |
322 | 0 |
323 | 0 | ASSERT(is_hvm_vcpu(v)); |
324 | 0 |
325 | 0 | memset(sh_ctxt, 0, sizeof(*sh_ctxt)); |
326 | 0 |
327 | 0 | sh_ctxt->ctxt.regs = regs; |
328 | 0 | sh_ctxt->ctxt.vendor = v->domain->arch.cpuid->x86_vendor; |
329 | 0 | sh_ctxt->ctxt.lma = hvm_long_mode_active(v); |
330 | 0 |
331 | 0 | /* Segment cache initialisation. Primed with CS. */ |
332 | 0 | creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt); |
333 | 0 |
334 | 0 | /* Work out the emulation mode. */ |
335 | 0 | if ( sh_ctxt->ctxt.lma && creg->l ) |
336 | 0 | sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64; |
337 | 0 | else |
338 | 0 | { |
339 | 0 | sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt); |
340 | 0 | sh_ctxt->ctxt.addr_size = creg->db ? 32 : 16; |
341 | 0 | sh_ctxt->ctxt.sp_size = sreg->db ? 32 : 16; |
342 | 0 | } |
343 | 0 |
344 | 0 | /* Attempt to prefetch whole instruction. */ |
345 | 0 | sh_ctxt->insn_buf_eip = regs->rip; |
346 | 0 | sh_ctxt->insn_buf_bytes = |
347 | 0 | (!hvm_translate_virtual_addr( |
348 | 0 | x86_seg_cs, regs->rip, sizeof(sh_ctxt->insn_buf), |
349 | 0 | hvm_access_insn_fetch, sh_ctxt, &addr) && |
350 | 0 | !hvm_fetch_from_guest_linear( |
351 | 0 | sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0, NULL)) |
352 | 0 | ? sizeof(sh_ctxt->insn_buf) : 0; |
353 | 0 |
354 | 0 | return &hvm_shadow_emulator_ops; |
355 | 0 | } |
356 | | |
357 | | /* Update an initialized emulation context to prepare for the next |
358 | | * instruction */ |
359 | | void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt, |
360 | | struct cpu_user_regs *regs) |
361 | 0 | { |
362 | 0 | struct vcpu *v = current; |
363 | 0 | unsigned long addr, diff; |
364 | 0 |
365 | 0 | ASSERT(is_hvm_vcpu(v)); |
366 | 0 |
367 | 0 | /* |
368 | 0 | * We don't refetch the segment bases, because we don't emulate |
369 | 0 | * writes to segment registers |
370 | 0 | */ |
371 | 0 | diff = regs->rip - sh_ctxt->insn_buf_eip; |
372 | 0 | if ( diff > sh_ctxt->insn_buf_bytes ) |
373 | 0 | { |
374 | 0 | /* Prefetch more bytes. */ |
375 | 0 | sh_ctxt->insn_buf_bytes = |
376 | 0 | (!hvm_translate_virtual_addr( |
377 | 0 | x86_seg_cs, regs->rip, sizeof(sh_ctxt->insn_buf), |
378 | 0 | hvm_access_insn_fetch, sh_ctxt, &addr) && |
379 | 0 | !hvm_fetch_from_guest_linear( |
380 | 0 | sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0, NULL)) |
381 | 0 | ? sizeof(sh_ctxt->insn_buf) : 0; |
382 | 0 | sh_ctxt->insn_buf_eip = regs->rip; |
383 | 0 | } |
384 | 0 | } |
385 | | |
386 | | |
387 | | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
388 | | /**************************************************************************/ |
389 | | /* Out-of-sync shadows. */ |
390 | | |
391 | | /* From time to time, we let a shadowed pagetable page go out of sync |
392 | | * with its shadow: the guest is allowed to write directly to the page, |
393 | | * and those writes are not synchronously reflected in the shadow. |
394 | | * This lets us avoid many emulations if the guest is writing a lot to a |
395 | | * pagetable, but it relaxes a pretty important invariant in the shadow |
396 | | * pagetable design. Therefore, some rules: |
397 | | * |
398 | | * 1. Only L1 pagetables may go out of sync: any page that is shadowed |
399 | | * at a higher level must be synchronously updated. This makes |
400 | | * using linear shadow pagetables much less dangerous. |
401 | | * That means that: (a) unsyncing code needs to check for higher-level |
402 | | * shadows, and (b) promotion code needs to resync. |
403 | | * |
404 | | * 2. All shadow operations on a guest page require the page to be brought |
405 | | * back into sync before proceeding. This must be done under the |
406 | | * paging lock so that the page is guaranteed to remain synced until |
407 | | * the operation completes. |
408 | | * |
409 | | * Exceptions to this rule: the pagefault and invlpg handlers may |
410 | | * update only one entry on an out-of-sync page without resyncing it. |
411 | | * |
412 | | * 3. Operations on shadows that do not start from a guest page need to |
413 | | * be aware that they may be handling an out-of-sync shadow. |
414 | | * |
415 | | * 4. Operations that do not normally take the paging lock (fast-path |
416 | | * #PF handler, INVLPG) must fall back to a locking, syncing version |
417 | | * if they see an out-of-sync table. |
418 | | * |
419 | | * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG) |
420 | | * must explicitly resync all relevant pages or update their |
421 | | * shadows. |
422 | | * |
423 | | * Currently out-of-sync pages are listed in a simple open-addressed |
424 | | * hash table with a second chance (must resist temptation to radically |
425 | | * over-engineer hash tables...) The virtual address of the access |
426 | | * which caused us to unsync the page is also kept in the hash table, as |
427 | | * a hint for finding the writable mappings later. |
428 | | * |
429 | | * We keep a hash per vcpu, because as far as possible we want to do |
430 | | * the re-sync on the same vcpu that did the unsync, so that the VA |
431 | | * hint will be valid. |
432 | | */ |
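To make the "open-addressed hash table with a second chance" concrete: every gmfn has a home slot (its MFN modulo the table size) and may instead sit in the slot after it, so lookups and removals probe at most two slots, exactly as oos_hash_add() and oos_hash_remove() below do. A toy, self-contained model of just the probing discipline (the table size and names are invented, and the real table keeps fixup and snapshot arrays in step with it):

    #define OOS_PAGES 3              /* stand-in for SHADOW_OOS_PAGES */
    #define INVALID   (~0UL)

    static unsigned long oos[OOS_PAGES] = { INVALID, INVALID, INVALID };

    /* Probe the home slot, then its neighbour; -1 if absent. */
    static int oos_lookup(unsigned long mfn)
    {
        int idx = mfn % OOS_PAGES;

        if ( oos[idx] != mfn )
            idx = (idx + 1) % OOS_PAGES;
        return oos[idx] == mfn ? idx : -1;
    }

    static void oos_insert(unsigned long mfn)
    {
        int idx = mfn % OOS_PAGES;

        /* An occupant found in its own home slot gets a second chance:
         * punt it to the next slot instead of evicting it outright. */
        if ( oos[idx] != INVALID && oos[idx] % OOS_PAGES == (unsigned long)idx )
        {
            unsigned long punted = oos[idx];

            oos[idx] = mfn;
            mfn = punted;
            idx = (idx + 1) % OOS_PAGES;
        }
        oos[idx] = mfn;   /* whoever was here is evicted (resynced) */
    }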
433 | | |
434 | | |
435 | | #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL |
436 | | static void sh_oos_audit(struct domain *d) |
437 | | { |
438 | | int idx, expected_idx, expected_idx_alt; |
439 | | struct page_info *pg; |
440 | | struct vcpu *v; |
441 | | |
442 | | for_each_vcpu(d, v) |
443 | | { |
444 | | for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) |
445 | | { |
446 | | mfn_t *oos = v->arch.paging.shadow.oos; |
447 | | if ( !mfn_valid(oos[idx]) ) |
448 | | continue; |
449 | | |
450 | | expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES; |
451 | | expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES); |
452 | | if ( idx != expected_idx && idx != expected_idx_alt ) |
453 | | { |
454 | | printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n", |
455 | | __func__, idx, mfn_x(oos[idx]), |
456 | | expected_idx, expected_idx_alt); |
457 | | BUG(); |
458 | | } |
459 | | pg = mfn_to_page(oos[idx]); |
460 | | if ( !(pg->count_info & PGC_page_table) ) |
461 | | { |
462 | | printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n", |
463 | | __func__, idx, mfn_x(oos[idx]), pg->count_info); |
464 | | BUG(); |
465 | | } |
466 | | if ( !(pg->shadow_flags & SHF_out_of_sync) ) |
467 | | { |
468 | | printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n", |
469 | | __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); |
470 | | BUG(); |
471 | | } |
472 | | if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) ) |
473 | | { |
474 | | printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n", |
475 | | __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); |
476 | | BUG(); |
477 | | } |
478 | | } |
479 | | } |
480 | | } |
481 | | #endif |
482 | | |
483 | | #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES |
484 | | void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) |
485 | 0 | { |
486 | 0 | int idx; |
487 | 0 | struct vcpu *v; |
488 | 0 | mfn_t *oos; |
489 | 0 |
490 | 0 | ASSERT(mfn_is_out_of_sync(gmfn)); |
491 | 0 |
492 | 0 | for_each_vcpu(d, v) |
493 | 0 | { |
494 | 0 | oos = v->arch.paging.shadow.oos; |
495 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
496 | 0 | if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) |
497 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
498 | 0 |
499 | 0 | if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) |
500 | 0 | return; |
501 | 0 | } |
502 | 0 |
503 | 0 | SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn)); |
504 | 0 | BUG(); |
505 | 0 | } |
506 | | #endif |
507 | | |
508 | | /* Update the shadow, but keep the page out of sync. */ |
509 | | static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn) |
510 | 0 | { |
511 | 0 | struct page_info *pg = mfn_to_page(gmfn); |
512 | 0 |
513 | 0 | ASSERT(mfn_valid(gmfn)); |
514 | 0 | ASSERT(page_is_out_of_sync(pg)); |
515 | 0 |
516 | 0 | /* Call out to the appropriate per-mode resyncing function */ |
517 | 0 | if ( pg->shadow_flags & SHF_L1_32 ) |
518 | 0 | SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn); |
519 | 0 | else if ( pg->shadow_flags & SHF_L1_PAE ) |
520 | 0 | SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn); |
521 | 0 | else if ( pg->shadow_flags & SHF_L1_64 ) |
522 | 0 | SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn); |
523 | 0 | } |
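The if/else ladder above is the usual dispatch into the multiply-compiled shadow code: multi.c is built once per guest paging depth, and SHADOW_INTERNAL_NAME() pastes the depth into each symbol so the variants can coexist in one binary. A rough, self-contained sketch of the mechanism (the real macro lives in private.h and may differ in its exact spelling):

    #include <stdio.h>

    /* Hypothetical stand-in for SHADOW_INTERNAL_NAME(). */
    #define INTERNAL_NAME_(name, levels) name ## __guest_ ## levels
    #define INTERNAL_NAME(name, levels)  INTERNAL_NAME_(name, levels)

    /* As if one source file had been compiled three times with
     * different GUEST_PAGING_LEVELS settings: */
    static void INTERNAL_NAME(resync, 2)(void) { puts("2-level resync"); }
    static void INTERNAL_NAME(resync, 3)(void) { puts("3-level resync"); }
    static void INTERNAL_NAME(resync, 4)(void) { puts("4-level resync"); }

    int main(void)
    {
        INTERNAL_NAME(resync, 3)();   /* expands to resync__guest_3() */
        return 0;
    }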
524 | | |
525 | | |
526 | | /* |
527 | | * Fixup arrays: We limit the maximum number of writable mappings to |
528 | | * SHADOW_OOS_FIXUPS and store enough information to remove them |
529 | | * quickly on resync. |
530 | | */ |
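struct oos_fixup itself is declared in a header outside this listing, but its shape can be reconstructed from the uses below: parallel smfn[]/off[] arrays naming the shadow PTEs that map the page writably, plus a ring cursor used for eviction. An inferred sketch (field names are taken from the code; the real declaration may differ):

    /* Inferred from oos_fixup_add() and oos_fixup_flush_gmfn() below. */
    struct oos_fixup {
        mfn_t smfn[SHADOW_OOS_FIXUPS];        /* shadow pages holding the PTEs */
        unsigned long off[SHADOW_OOS_FIXUPS]; /* offset of each PTE within them */
        int next;                             /* ring cursor: next slot to reuse */
    };

When all SHADOW_OOS_FIXUPS slots are in use, oos_fixup_add() reuses slot 'next' and advances it modulo SHADOW_OOS_FIXUPS, i.e. FIFO eviction of the oldest recorded mapping.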
531 | | |
532 | | static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn, |
533 | | struct oos_fixup *fixup) |
534 | 0 | { |
535 | 0 | struct domain *d = v->domain; |
536 | 0 | int i; |
537 | 0 | for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ ) |
538 | 0 | { |
539 | 0 | if ( !mfn_eq(fixup->smfn[i], INVALID_MFN) ) |
540 | 0 | { |
541 | 0 | sh_remove_write_access_from_sl1p(d, gmfn, |
542 | 0 | fixup->smfn[i], |
543 | 0 | fixup->off[i]); |
544 | 0 | fixup->smfn[i] = INVALID_MFN; |
545 | 0 | } |
546 | 0 | } |
547 | 0 |
548 | 0 | /* Always flush the TLBs. See comment on oos_fixup_add(). */ |
549 | 0 | return 1; |
550 | 0 | } |
551 | | |
552 | | void oos_fixup_add(struct domain *d, mfn_t gmfn, |
553 | | mfn_t smfn, unsigned long off) |
554 | 0 | { |
555 | 0 | int idx, next; |
556 | 0 | mfn_t *oos; |
557 | 0 | struct oos_fixup *oos_fixup; |
558 | 0 | struct vcpu *v; |
559 | 0 |
560 | 0 | perfc_incr(shadow_oos_fixup_add); |
561 | 0 |
562 | 0 | for_each_vcpu(d, v) |
563 | 0 | { |
564 | 0 | oos = v->arch.paging.shadow.oos; |
565 | 0 | oos_fixup = v->arch.paging.shadow.oos_fixup; |
566 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
567 | 0 | if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) |
568 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
569 | 0 | if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) |
570 | 0 | { |
571 | 0 | int i; |
572 | 0 | for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ ) |
573 | 0 | { |
574 | 0 | if ( mfn_valid(oos_fixup[idx].smfn[i]) |
575 | 0 | && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn)) |
576 | 0 | && (oos_fixup[idx].off[i] == off) ) |
577 | 0 | return; |
578 | 0 | } |
579 | 0 |
580 | 0 | next = oos_fixup[idx].next; |
581 | 0 |
582 | 0 | if ( !mfn_eq(oos_fixup[idx].smfn[next], INVALID_MFN) ) |
583 | 0 | { |
584 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT); |
585 | 0 |
586 | 0 | /* Reuse this slot and remove current writable mapping. */ |
587 | 0 | sh_remove_write_access_from_sl1p(d, gmfn, |
588 | 0 | oos_fixup[idx].smfn[next], |
589 | 0 | oos_fixup[idx].off[next]); |
590 | 0 | perfc_incr(shadow_oos_fixup_evict); |
591 | 0 | /* We should flush the TLBs now, because we removed a |
592 | 0 | writable mapping, but since the shadow is already |
593 | 0 | OOS it is no problem if another vcpu writes to |
594 | 0 | this page table. We just have to be very careful to |
595 | 0 | *always* flush the TLBs on resync. */ |
596 | 0 | } |
597 | 0 |
598 | 0 | oos_fixup[idx].smfn[next] = smfn; |
599 | 0 | oos_fixup[idx].off[next] = off; |
600 | 0 | oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS; |
601 | 0 |
602 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD); |
603 | 0 | return; |
604 | 0 | } |
605 | 0 | } |
606 | 0 |
607 | 0 | SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); |
608 | 0 | BUG(); |
609 | 0 | } |
610 | | |
611 | | static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, |
612 | | struct oos_fixup *fixup) |
613 | 0 | { |
614 | 0 | struct domain *d = v->domain; |
615 | 0 | int ftlb = 0; |
616 | 0 |
617 | 0 | ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup); |
618 | 0 |
619 | 0 | switch ( sh_remove_write_access(d, gmfn, 0, 0) ) |
620 | 0 | { |
621 | 0 | default: |
622 | 0 | case 0: |
623 | 0 | break; |
624 | 0 |
625 | 0 | case 1: |
626 | 0 | ftlb |= 1; |
627 | 0 | break; |
628 | 0 |
629 | 0 | case -1: |
630 | 0 | /* An unfindable writeable typecount has appeared, probably via a |
631 | 0 | * grant table entry: we can't shoot the mapping, so try to unshadow |
632 | 0 | * the page. If that doesn't work either, the guest is granting |
633 | 0 | * out its pagetables and must be killed after all. |
634 | 0 | * This will flush the TLB, so we can return with no worries. */ |
635 | 0 | sh_remove_shadows(d, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); |
636 | 0 | return 1; |
637 | 0 | } |
638 | 0 |
639 | 0 | if ( ftlb ) |
640 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
641 | 0 |
642 | 0 | return 0; |
643 | 0 | } |
644 | | |
645 | | |
646 | | static inline void trace_resync(int event, mfn_t gmfn) |
647 | 0 | { |
648 | 0 | if ( tb_init_done ) |
649 | 0 | { |
650 | 0 | /* Convert gmfn to gfn */ |
651 | 0 | unsigned long gfn = mfn_to_gfn(current->domain, gmfn); |
652 | 0 | __trace_var(event, 0/*!tsc*/, sizeof(gfn), &gfn); |
653 | 0 | } |
654 | 0 | } |
655 | | |
656 | | /* Pull all the entries on an out-of-sync page back into sync. */ |
657 | | static void _sh_resync(struct vcpu *v, mfn_t gmfn, |
658 | | struct oos_fixup *fixup, mfn_t snp) |
659 | 0 | { |
660 | 0 | struct page_info *pg = mfn_to_page(gmfn); |
661 | 0 |
662 | 0 | ASSERT(paging_locked_by_me(v->domain)); |
663 | 0 | ASSERT(mfn_is_out_of_sync(gmfn)); |
664 | 0 | /* Guest page must be shadowed *only* as L1 when out of sync. */ |
665 | 0 | ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask |
666 | 0 | & ~SHF_L1_ANY)); |
667 | 0 | ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn))); |
668 | 0 |
669 | 0 | SHADOW_PRINTK("%pv gmfn=%"PRI_mfn"\n", v, mfn_x(gmfn)); |
670 | 0 |
671 | 0 | /* Need to pull write access so the page *stays* in sync. */ |
672 | 0 | if ( oos_remove_write_access(v, gmfn, fixup) ) |
673 | 0 | { |
674 | 0 | /* Page has been unshadowed. */ |
675 | 0 | return; |
676 | 0 | } |
677 | 0 |
678 | 0 | /* No more writable mappings of this page, please */ |
679 | 0 | pg->shadow_flags &= ~SHF_oos_may_write; |
680 | 0 |
681 | 0 | /* Update the shadows with current guest entries. */ |
682 | 0 | _sh_resync_l1(v, gmfn, snp); |
683 | 0 |
684 | 0 | /* Now we know all the entries are synced, and will stay that way */ |
685 | 0 | pg->shadow_flags &= ~SHF_out_of_sync; |
686 | 0 | perfc_incr(shadow_resync); |
687 | 0 | trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn); |
688 | 0 | } |
689 | | |
690 | | |
691 | | /* Add an MFN to the list of out-of-sync guest pagetables */ |
692 | | static void oos_hash_add(struct vcpu *v, mfn_t gmfn) |
693 | 0 | { |
694 | 0 | int i, idx, oidx, swap = 0; |
695 | 0 | void *gptr, *gsnpptr; |
696 | 0 | mfn_t *oos = v->arch.paging.shadow.oos; |
697 | 0 | mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
698 | 0 | struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup; |
699 | 0 | struct oos_fixup fixup = { .next = 0 }; |
700 | 0 |
701 | 0 | for (i = 0; i < SHADOW_OOS_FIXUPS; i++ ) |
702 | 0 | fixup.smfn[i] = INVALID_MFN; |
703 | 0 |
704 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
705 | 0 | oidx = idx; |
706 | 0 |
707 | 0 | if ( mfn_valid(oos[idx]) |
708 | 0 | && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx ) |
709 | 0 | { |
710 | 0 | /* Punt the current occupant into the next slot */ |
711 | 0 | SWAP(oos[idx], gmfn); |
712 | 0 | SWAP(oos_fixup[idx], fixup); |
713 | 0 | swap = 1; |
714 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
715 | 0 | } |
716 | 0 | if ( mfn_valid(oos[idx]) ) |
717 | 0 | { |
718 | 0 | /* Crush the current occupant. */ |
719 | 0 | _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); |
720 | 0 | perfc_incr(shadow_unsync_evict); |
721 | 0 | } |
722 | 0 | oos[idx] = gmfn; |
723 | 0 | oos_fixup[idx] = fixup; |
724 | 0 |
725 | 0 | if ( swap ) |
726 | 0 | SWAP(oos_snapshot[idx], oos_snapshot[oidx]); |
727 | 0 |
728 | 0 | gptr = map_domain_page(oos[oidx]); |
729 | 0 | gsnpptr = map_domain_page(oos_snapshot[oidx]); |
730 | 0 | memcpy(gsnpptr, gptr, PAGE_SIZE); |
731 | 0 | unmap_domain_page(gptr); |
732 | 0 | unmap_domain_page(gsnpptr); |
733 | 0 | } |
734 | | |
735 | | /* Remove an MFN from the list of out-of-sync guest pagetables */ |
736 | | static void oos_hash_remove(struct domain *d, mfn_t gmfn) |
737 | 0 | { |
738 | 0 | int idx; |
739 | 0 | mfn_t *oos; |
740 | 0 | struct vcpu *v; |
741 | 0 |
742 | 0 | SHADOW_PRINTK("d%d gmfn %lx\n", d->domain_id, mfn_x(gmfn)); |
743 | 0 |
744 | 0 | for_each_vcpu(d, v) |
745 | 0 | { |
746 | 0 | oos = v->arch.paging.shadow.oos; |
747 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
748 | 0 | if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) |
749 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
750 | 0 | if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) |
751 | 0 | { |
752 | 0 | oos[idx] = INVALID_MFN; |
753 | 0 | return; |
754 | 0 | } |
755 | 0 | } |
756 | 0 |
757 | 0 | SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); |
758 | 0 | BUG(); |
759 | 0 | } |
760 | | |
761 | | mfn_t oos_snapshot_lookup(struct domain *d, mfn_t gmfn) |
762 | 0 | { |
763 | 0 | int idx; |
764 | 0 | mfn_t *oos; |
765 | 0 | mfn_t *oos_snapshot; |
766 | 0 | struct vcpu *v; |
767 | 0 |
768 | 0 | for_each_vcpu(d, v) |
769 | 0 | { |
770 | 0 | oos = v->arch.paging.shadow.oos; |
771 | 0 | oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
772 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
773 | 0 | if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) |
774 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
775 | 0 | if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) |
776 | 0 | { |
777 | 0 | return oos_snapshot[idx]; |
778 | 0 | } |
779 | 0 | } |
780 | 0 |
781 | 0 | SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); |
782 | 0 | BUG(); |
783 | 0 | } |
784 | | |
785 | | /* Pull a single guest page back into sync */ |
786 | | void sh_resync(struct domain *d, mfn_t gmfn) |
787 | 0 | { |
788 | 0 | int idx; |
789 | 0 | mfn_t *oos; |
790 | 0 | mfn_t *oos_snapshot; |
791 | 0 | struct oos_fixup *oos_fixup; |
792 | 0 | struct vcpu *v; |
793 | 0 |
794 | 0 | for_each_vcpu(d, v) |
795 | 0 | { |
796 | 0 | oos = v->arch.paging.shadow.oos; |
797 | 0 | oos_fixup = v->arch.paging.shadow.oos_fixup; |
798 | 0 | oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
799 | 0 | idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; |
800 | 0 | if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) |
801 | 0 | idx = (idx + 1) % SHADOW_OOS_PAGES; |
802 | 0 |
|
803 | 0 | if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) |
804 | 0 | { |
805 | 0 | _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]); |
806 | 0 | oos[idx] = INVALID_MFN; |
807 | 0 | return; |
808 | 0 | } |
809 | 0 | } |
810 | 0 |
811 | 0 | SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); |
812 | 0 | BUG(); |
813 | 0 | } |
814 | | |
815 | | /* Figure out whether it's definitely safe not to sync this l1 table, |
816 | | * by making a call out to the mode in which that shadow was made. */ |
817 | | static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn) |
818 | 0 | { |
819 | 0 | struct page_info *pg = mfn_to_page(gl1mfn); |
820 | 0 | if ( pg->shadow_flags & SHF_L1_32 ) |
821 | 0 | return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn); |
822 | 0 | else if ( pg->shadow_flags & SHF_L1_PAE ) |
823 | 0 | return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn); |
824 | 0 | else if ( pg->shadow_flags & SHF_L1_64 ) |
825 | 0 | return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn); |
826 | 0 | SHADOW_ERROR("gmfn %#lx was OOS but not shadowed as an l1.\n", |
827 | 0 | mfn_x(gl1mfn)); |
828 | 0 | BUG(); |
829 | 0 | } |
830 | | |
831 | | |
832 | | /* Pull all out-of-sync pages back into sync. Pages brought out of sync |
833 | | * on other vcpus are allowed to remain out of sync, but their contents |
834 | | * will be made safe (TLB flush semantics); pages unsynced by this vcpu |
835 | | * are brought back into sync and write-protected. If skip != 0, we try |
836 | | * to avoid resyncing at all if we think we can get away with it. */ |
837 | | void sh_resync_all(struct vcpu *v, int skip, int this, int others) |
838 | 0 | { |
839 | 0 | int idx; |
840 | 0 | struct vcpu *other; |
841 | 0 | mfn_t *oos = v->arch.paging.shadow.oos; |
842 | 0 | mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
843 | 0 | struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup; |
844 | 0 |
845 | 0 | SHADOW_PRINTK("%pv\n", v); |
846 | 0 |
847 | 0 | ASSERT(paging_locked_by_me(v->domain)); |
848 | 0 |
849 | 0 | if ( !this ) |
850 | 0 | goto resync_others; |
851 | 0 |
852 | 0 | /* First: resync all of this vcpu's oos pages */ |
853 | 0 | for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) |
854 | 0 | if ( mfn_valid(oos[idx]) ) |
855 | 0 | { |
856 | 0 | /* Write-protect and sync contents */ |
857 | 0 | _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); |
858 | 0 | oos[idx] = INVALID_MFN; |
859 | 0 | } |
860 | 0 |
861 | 0 | resync_others: |
862 | 0 | if ( !others ) |
863 | 0 | return; |
864 | 0 |
865 | 0 | /* Second: make all *other* vcpus' oos pages safe. */ |
866 | 0 | for_each_vcpu(v->domain, other) |
867 | 0 | { |
868 | 0 | if ( v == other ) |
869 | 0 | continue; |
870 | 0 |
871 | 0 | oos = other->arch.paging.shadow.oos; |
872 | 0 | oos_fixup = other->arch.paging.shadow.oos_fixup; |
873 | 0 | oos_snapshot = other->arch.paging.shadow.oos_snapshot; |
874 | 0 |
875 | 0 | for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) |
876 | 0 | { |
877 | 0 | if ( !mfn_valid(oos[idx]) ) |
878 | 0 | continue; |
879 | 0 |
880 | 0 | if ( skip ) |
881 | 0 | { |
882 | 0 | /* Update the shadows and leave the page OOS. */ |
883 | 0 | if ( sh_skip_sync(v, oos[idx]) ) |
884 | 0 | continue; |
885 | 0 | trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]); |
886 | 0 | _sh_resync_l1(other, oos[idx], oos_snapshot[idx]); |
887 | 0 | } |
888 | 0 | else |
889 | 0 | { |
890 | 0 | /* Write-protect and sync contents */ |
891 | 0 | _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); |
892 | 0 | oos[idx] = INVALID_MFN; |
893 | 0 | } |
894 | 0 | } |
895 | 0 | } |
896 | 0 | } |
897 | | |
898 | | /* Allow a shadowed page to go out of sync. Unsyncs are traced in |
899 | | * multi.c:sh_page_fault() */ |
900 | | int sh_unsync(struct vcpu *v, mfn_t gmfn) |
901 | 0 | { |
902 | 0 | struct page_info *pg; |
903 | 0 |
904 | 0 | ASSERT(paging_locked_by_me(v->domain)); |
905 | 0 |
906 | 0 | SHADOW_PRINTK("%pv gmfn=%"PRI_mfn"\n", v, mfn_x(gmfn)); |
907 | 0 |
908 | 0 | pg = mfn_to_page(gmfn); |
909 | 0 |
910 | 0 | /* The guest page must be shadowed *only* as an L1 and *only* once |
911 | 0 | * when out of sync. Get out now if it's already out of sync. |
912 | 0 | * Also, we can't safely unsync if some vcpus have paging disabled. */ |
913 | 0 | if ( pg->shadow_flags & |
914 | 0 | ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) |
915 | 0 | || sh_page_has_multiple_shadows(pg) |
916 | 0 | || is_pv_vcpu(v) |
917 | 0 | || !v->domain->arch.paging.shadow.oos_active ) |
918 | 0 | return 0; |
919 | 0 |
920 | 0 | pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write; |
921 | 0 | oos_hash_add(v, gmfn); |
922 | 0 | perfc_incr(shadow_unsync); |
923 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC); |
924 | 0 | return 1; |
925 | 0 | } |
926 | | |
927 | | #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ |
928 | | |
929 | | |
930 | | /**************************************************************************/ |
931 | | /* Code for "promoting" a guest page to the point where the shadow code is |
932 | | * willing to let it be treated as a guest page table. This generally |
933 | | * involves making sure there are no writable mappings available to the guest |
934 | | * for this page. |
935 | | */ |
936 | | void shadow_promote(struct domain *d, mfn_t gmfn, unsigned int type) |
937 | 0 | { |
938 | 0 | struct page_info *page = mfn_to_page(gmfn); |
939 | 0 |
940 | 0 | ASSERT(mfn_valid(gmfn)); |
941 | 0 |
942 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
943 | 0 | /* Is the page already shadowed and out of sync? */ |
944 | 0 | if ( page_is_out_of_sync(page) ) |
945 | 0 | sh_resync(d, gmfn); |
946 | 0 | #endif |
947 | 0 |
948 | 0 | /* We should never try to promote a gmfn that has writeable mappings */ |
949 | 0 | ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page |
950 | 0 | || (page->u.inuse.type_info & PGT_count_mask) == 0 |
951 | 0 | || d->is_shutting_down); |
952 | 0 |
953 | 0 | /* Is the page already shadowed? */ |
954 | 0 | if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) |
955 | 0 | page->shadow_flags = 0; |
956 | 0 |
957 | 0 | ASSERT(!test_bit(type, &page->shadow_flags)); |
958 | 0 | set_bit(type, &page->shadow_flags); |
959 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE); |
960 | 0 | } |
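The test_and_set_bit() above doubles as a "first shadow" latch: only the caller that atomically flips PGC_page_table from clear to set resets shadow_flags, and shadow_demote() below clears the bit again once the last shadow type is gone. The same latch in miniature, using C11 atomics instead of Xen's bitops:

    #include <stdatomic.h>

    static atomic_flag is_page_table = ATOMIC_FLAG_INIT;
    static unsigned long shadow_flags;

    /* One-time initialisation guarded by an atomic flag flip. */
    static void promote(void)
    {
        /* Only the first promoter sees "was clear" and initialises. */
        if ( !atomic_flag_test_and_set(&is_page_table) )
            shadow_flags = 0;
    }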
961 | | |
962 | | void shadow_demote(struct domain *d, mfn_t gmfn, u32 type) |
963 | 0 | { |
964 | 0 | struct page_info *page = mfn_to_page(gmfn); |
965 | 0 |
966 | 0 | ASSERT(test_bit(_PGC_page_table, &page->count_info)); |
967 | 0 | ASSERT(test_bit(type, &page->shadow_flags)); |
968 | 0 |
969 | 0 | clear_bit(type, &page->shadow_flags); |
970 | 0 |
971 | 0 | if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) |
972 | 0 | { |
973 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
974 | 0 | /* Was the page out of sync? */ |
975 | 0 | if ( page_is_out_of_sync(page) ) |
976 | 0 | { |
977 | 0 | oos_hash_remove(d, gmfn); |
978 | 0 | } |
979 | 0 | #endif |
980 | 0 | clear_bit(_PGC_page_table, &page->count_info); |
981 | 0 | } |
982 | 0 |
983 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE); |
984 | 0 | } |
985 | | |
986 | | /**************************************************************************/ |
987 | | /* Validate a pagetable change from the guest and update the shadows. |
988 | | * Returns a bitmask of SHADOW_SET_* flags. */ |
989 | | |
990 | | int |
991 | | sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size) |
992 | 0 | { |
993 | 0 | int result = 0; |
994 | 0 | struct page_info *page = mfn_to_page(gmfn); |
995 | 0 |
996 | 0 | paging_mark_dirty(v->domain, gmfn); |
997 | 0 |
998 | 0 | // Determine which types of shadows are affected, and update each. |
999 | 0 | // |
1000 | 0 | // Always validate L1s before L2s to prevent another cpu with a linear |
1001 | 0 | // mapping of this gmfn from seeing a walk that results from |
1002 | 0 | // using the new L2 value and the old L1 value. (It is OK for such a |
1003 | 0 | // guest to see a walk that uses the old L2 value with the new L1 value, |
1004 | 0 | // as hardware could behave this way if one level of the pagewalk occurs |
1005 | 0 | // before the store, and the next level of the pagewalk occurs after the |
1006 | 0 | // store. |
1007 | 0 | // |
1008 | 0 | // Ditto for L2s before L3s, etc. |
1009 | 0 | // |
1010 | 0 |
1011 | 0 | if ( !(page->count_info & PGC_page_table) ) |
1012 | 0 | return 0; /* Not shadowed at all */ |
1013 | 0 |
1014 | 0 | if ( page->shadow_flags & SHF_L1_32 ) |
1015 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2) |
1016 | 0 | (v, gmfn, entry, size); |
1017 | 0 | if ( page->shadow_flags & SHF_L2_32 ) |
1018 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2) |
1019 | 0 | (v, gmfn, entry, size); |
1020 | 0 |
1021 | 0 | if ( page->shadow_flags & SHF_L1_PAE ) |
1022 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3) |
1023 | 0 | (v, gmfn, entry, size); |
1024 | 0 | if ( page->shadow_flags & SHF_L2_PAE ) |
1025 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3) |
1026 | 0 | (v, gmfn, entry, size); |
1027 | 0 | if ( page->shadow_flags & SHF_L2H_PAE ) |
1028 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3) |
1029 | 0 | (v, gmfn, entry, size); |
1030 | 0 |
1031 | 0 | if ( page->shadow_flags & SHF_L1_64 ) |
1032 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4) |
1033 | 0 | (v, gmfn, entry, size); |
1034 | 0 | if ( page->shadow_flags & SHF_L2_64 ) |
1035 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4) |
1036 | 0 | (v, gmfn, entry, size); |
1037 | 0 | if ( page->shadow_flags & SHF_L2H_64 ) |
1038 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4) |
1039 | 0 | (v, gmfn, entry, size); |
1040 | 0 | if ( page->shadow_flags & SHF_L3_64 ) |
1041 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4) |
1042 | 0 | (v, gmfn, entry, size); |
1043 | 0 | if ( page->shadow_flags & SHF_L4_64 ) |
1044 | 0 | result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4) |
1045 | 0 | (v, gmfn, entry, size); |
1046 | 0 |
1047 | 0 | this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED)); |
1048 | 0 |
1049 | 0 | return result; |
1050 | 0 | } |
1051 | | |
1052 | | |
1053 | | void |
1054 | | sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, |
1055 | | void *entry, u32 size) |
1056 | | /* This is the entry point for emulated writes to pagetables in HVM guests and |
1057 | | * PV translated guests. |
1058 | | */ |
1059 | 0 | { |
1060 | 0 | struct domain *d = v->domain; |
1061 | 0 | int rc; |
1062 | 0 |
1063 | 0 | ASSERT(paging_locked_by_me(v->domain)); |
1064 | 0 | rc = sh_validate_guest_entry(v, gmfn, entry, size); |
1065 | 0 | if ( rc & SHADOW_SET_FLUSH ) |
1066 | 0 | /* Need to flush TLBs to pick up shadow PT changes */ |
1067 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
1068 | 0 | if ( rc & SHADOW_SET_ERROR ) |
1069 | 0 | { |
1070 | 0 | /* This page is probably not a pagetable any more: tear it out of the |
1071 | 0 | * shadows, along with any tables that reference it. |
1072 | 0 | * Since the validate call above will have made a "safe" (i.e. zero) |
1073 | 0 | * shadow entry, we can let the domain live even if we can't fully |
1074 | 0 | * unshadow the page. */ |
1075 | 0 | sh_remove_shadows(d, gmfn, 0, 0); |
1076 | 0 | } |
1077 | 0 | } |
1078 | | |
1079 | | |
1080 | | /**************************************************************************/ |
1081 | | /* Memory management for shadow pages. */ |
1082 | | |
1083 | | /* Allocating shadow pages |
1084 | | * ----------------------- |
1085 | | * |
1086 | | * Most shadow pages are allocated singly, but there is one case where |
1087 | | * we need to allocate multiple pages together: shadowing 32-bit guest |
1088 | | * tables on PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB |
1089 | | * of virtual address space, and needs to be shadowed by two PAE/64-bit |
1090 | | * l1 tables (covering 2MB of virtual address space each). Similarly, a |
1091 | | * 32-bit guest l2 table (4GB va) needs to be shadowed by four |
1092 | | * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are |
1093 | | * not contiguous in memory; functions for handling offsets into them are |
1094 | | * defined in shadow/multi.c (shadow_l1_index() etc.) |
1095 | | * |
1096 | | * This table shows the allocation behaviour of the different modes: |
1097 | | * |
1098 | | * Xen paging      64b  64b  64b |
1099 | | * Guest paging    32b  pae  64b |
1100 | | * PV or HVM       HVM  HVM   * |
1101 | | * Shadow paging   pae  pae  64b |
1102 | | * |
1103 | | * sl1 size         8k   4k   4k |
1104 | | * sl2 size        16k   4k   4k |
1105 | | * sl3 size         -    -    4k |
1106 | | * sl4 size         -    -    4k |
1107 | | * |
1108 | | * In HVM guests, the p2m table is built out of shadow pages, and we provide |
1109 | | * a function for the p2m management to steal pages, in max-order chunks, from |
1110 | | * the free pool. |
1111 | | */ |
1112 | | |
1113 | | const u8 sh_type_to_size[] = { |
1114 | | 1, /* SH_type_none */ |
1115 | | 2, /* SH_type_l1_32_shadow */ |
1116 | | 2, /* SH_type_fl1_32_shadow */ |
1117 | | 4, /* SH_type_l2_32_shadow */ |
1118 | | 1, /* SH_type_l1_pae_shadow */ |
1119 | | 1, /* SH_type_fl1_pae_shadow */ |
1120 | | 1, /* SH_type_l2_pae_shadow */ |
1121 | | 1, /* SH_type_l2h_pae_shadow */ |
1122 | | 1, /* SH_type_l1_64_shadow */ |
1123 | | 1, /* SH_type_fl1_64_shadow */ |
1124 | | 1, /* SH_type_l2_64_shadow */ |
1125 | | 1, /* SH_type_l2h_64_shadow */ |
1126 | | 1, /* SH_type_l3_64_shadow */ |
1127 | | 1, /* SH_type_l4_64_shadow */ |
1128 | | 1, /* SH_type_p2m_table */ |
1129 | | 1, /* SH_type_monitor_table */ |
1130 | | 1 /* SH_type_oos_snapshot */ |
1131 | | }; |
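Presumably shadow_size() is a thin wrapper that indexes this array (which is consistent with every use in shadow_alloc() and shadow_free() below), so the table is the concrete form of the comment above: a 32-bit guest l1 shadow takes two 4k pages (8k), a 32-bit guest l2 takes four (16k), and every other shadow type is a single page. A toy check under that assumption:

    #include <assert.h>

    enum { SH_type_none, SH_type_l1_32_shadow, SH_type_fl1_32_shadow,
           SH_type_l2_32_shadow };                 /* first few types only */

    static const unsigned char sh_type_to_size[] = { 1, 2, 2, 4 };

    #define shadow_size(t) ((unsigned int)sh_type_to_size[t])  /* assumed */

    int main(void)
    {
        assert(shadow_size(SH_type_l1_32_shadow) * 4096 ==  8192); /*  8k */
        assert(shadow_size(SH_type_l2_32_shadow) * 4096 == 16384); /* 16k */
        return 0;
    }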
1132 | | |
1133 | | /* Figure out the least acceptable quantity of shadow memory. |
1134 | | * The minimum memory requirement for always being able to free up a |
1135 | | * chunk of memory is very small -- only three max-order chunks per |
1136 | | * vcpu to hold the top level shadows and pages with Xen mappings in them. |
1137 | | * |
1138 | | * But for a guest to be guaranteed to successfully execute a single |
1139 | | * instruction, we must be able to map a large number (about thirty) of |
1140 | | * VAs at the same time, which means that, to guarantee progress, we must |
1141 | | * allow for more than ninety allocated pages per vcpu. We round that |
1142 | | * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's |
1143 | | * worth to make sure we never return zero. */ |
1144 | | static unsigned int shadow_min_acceptable_pages(struct domain *d) |
1145 | 0 | { |
1146 | 0 | u32 vcpu_count = 1; |
1147 | 0 | struct vcpu *v; |
1148 | 0 |
1149 | 0 | for_each_vcpu(d, v) |
1150 | 0 | vcpu_count++; |
1151 | 0 |
1152 | 0 | return (vcpu_count * 128); |
1153 | 0 | } |
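As a worked example of the floor this sets: with 4 vcpus the loop leaves vcpu_count at 5 (it starts at 1, matching the "add 1 more vcpu's worth" note above), so the minimum is 5 * 128 = 640 pages, i.e. 640 * 4k = 2.5MB of shadow pool, and the result stays non-zero even while a domain has no vcpus yet.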
1154 | | |
1155 | | /* Dispatcher function: call the per-mode function that will unhook the |
1156 | | * non-Xen mappings in this top-level shadow mfn. With user_only == 1, |
1157 | | * unhooks only the user-mode mappings. */ |
1158 | | void shadow_unhook_mappings(struct domain *d, mfn_t smfn, int user_only) |
1159 | 0 | { |
1160 | 0 | struct page_info *sp = mfn_to_page(smfn); |
1161 | 0 | switch ( sp->u.sh.type ) |
1162 | 0 | { |
1163 | 0 | case SH_type_l2_32_shadow: |
1164 | 0 | SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(d, smfn, user_only); |
1165 | 0 | break; |
1166 | 0 | case SH_type_l2_pae_shadow: |
1167 | 0 | case SH_type_l2h_pae_shadow: |
1168 | 0 | SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(d, smfn, user_only); |
1169 | 0 | break; |
1170 | 0 | case SH_type_l4_64_shadow: |
1171 | 0 | SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(d, smfn, user_only); |
1172 | 0 | break; |
1173 | 0 | default: |
1174 | 0 | SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type); |
1175 | 0 | BUG(); |
1176 | 0 | } |
1177 | 0 | } |
1178 | | |
1179 | | static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) |
1180 | 0 | { |
1181 | 0 | if ( tb_init_done ) |
1182 | 0 | { |
1183 | 0 | /* Convert smfn to gfn */ |
1184 | 0 | unsigned long gfn; |
1185 | 0 | ASSERT(mfn_valid(smfn)); |
1186 | 0 | gfn = mfn_to_gfn(d, backpointer(mfn_to_page(smfn))); |
1187 | 0 | __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/, sizeof(gfn), &gfn); |
1188 | 0 | } |
1189 | 0 | } |
1190 | | |
1191 | | /* Make sure there are at least 'pages' pages |
1192 | | * available in the shadow page pool. */ |
1193 | | static void _shadow_prealloc(struct domain *d, unsigned int pages) |
1194 | 0 | { |
1195 | 0 | struct vcpu *v; |
1196 | 0 | struct page_info *sp, *t; |
1197 | 0 | mfn_t smfn; |
1198 | 0 | int i; |
1199 | 0 |
1200 | 0 | if ( d->arch.paging.shadow.free_pages >= pages ) return; |
1201 | 0 |
1202 | 0 | /* Shouldn't have enabled shadows if we've no vcpus. */ |
1203 | 0 | ASSERT(d->vcpu && d->vcpu[0]); |
1204 | 0 |
1205 | 0 | /* Stage one: walk the list of pinned pages, unpinning them */ |
1206 | 0 | perfc_incr(shadow_prealloc_1); |
1207 | 0 | foreach_pinned_shadow(d, sp, t) |
1208 | 0 | { |
1209 | 0 | smfn = page_to_mfn(sp); |
1210 | 0 |
1211 | 0 | /* Unpin this top-level shadow */ |
1212 | 0 | trace_shadow_prealloc_unpin(d, smfn); |
1213 | 0 | sh_unpin(d, smfn); |
1214 | 0 |
1215 | 0 | /* See if that freed up enough space */ |
1216 | 0 | if ( d->arch.paging.shadow.free_pages >= pages ) return; |
1217 | 0 | } |
1218 | 0 |
1219 | 0 | /* Stage two: all shadow pages are in use in hierarchies that are |
1220 | 0 | * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen |
1221 | 0 | * mappings. */ |
1222 | 0 | perfc_incr(shadow_prealloc_2); |
1223 | 0 |
1224 | 0 | for_each_vcpu(d, v) |
1225 | 0 | for ( i = 0 ; i < 4 ; i++ ) |
1226 | 0 | { |
1227 | 0 | if ( !pagetable_is_null(v->arch.shadow_table[i]) ) |
1228 | 0 | { |
1229 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK); |
1230 | 0 | shadow_unhook_mappings(d, |
1231 | 0 | pagetable_get_mfn(v->arch.shadow_table[i]), 0); |
1232 | 0 |
1233 | 0 | /* See if that freed up enough space */ |
1234 | 0 | if ( d->arch.paging.shadow.free_pages >= pages ) |
1235 | 0 | { |
1236 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
1237 | 0 | return; |
1238 | 0 | } |
1239 | 0 | } |
1240 | 0 | } |
1241 | 0 |
1242 | 0 | /* Nothing more we can do: all remaining shadows are of pages that |
1243 | 0 | * hold Xen mappings for some vcpu. This can never happen. */ |
1244 | 0 | SHADOW_ERROR("Can't pre-allocate %u shadow pages!\n" |
1245 | 0 | " shadow pages total = %u, free = %u, p2m=%u\n", |
1246 | 0 | pages, |
1247 | 0 | d->arch.paging.shadow.total_pages, |
1248 | 0 | d->arch.paging.shadow.free_pages, |
1249 | 0 | d->arch.paging.shadow.p2m_pages); |
1250 | 0 | BUG(); |
1251 | 0 | } |
1252 | | |
1253 | | /* Make sure there are at least 'count' shadows' worth of pages, of the |
1254 | | * size appropriate for 'type', available in the shadow page pool. |
1255 | | * This must be called before any calls to shadow_alloc(). Since this |
1256 | | * will free existing shadows to make room, it must be called early enough |
1257 | | * to avoid freeing shadows that the caller is currently working on. */ |
1258 | | void shadow_prealloc(struct domain *d, u32 type, unsigned int count) |
1259 | 0 | { |
1260 | 0 | return _shadow_prealloc(d, shadow_size(type) * count); |
1261 | 0 | } |
1262 | | |
1263 | | /* Deliberately free all the memory we can: this will tear down all of |
1264 | | * this domain's shadows */ |
1265 | | static void shadow_blow_tables(struct domain *d) |
1266 | 0 | { |
1267 | 0 | struct page_info *sp, *t; |
1268 | 0 | struct vcpu *v; |
1269 | 0 | mfn_t smfn; |
1270 | 0 | int i; |
1271 | 0 |
1272 | 0 | /* Shouldn't have enabled shadows if we've no vcpus. */ |
1273 | 0 | ASSERT(d->vcpu && d->vcpu[0]); |
1274 | 0 |
1275 | 0 | /* Pass one: unpin all pinned pages */ |
1276 | 0 | foreach_pinned_shadow(d, sp, t) |
1277 | 0 | { |
1278 | 0 | smfn = page_to_mfn(sp); |
1279 | 0 | sh_unpin(d, smfn); |
1280 | 0 | } |
1281 | 0 |
1282 | 0 | /* Second pass: unhook entries of in-use shadows */ |
1283 | 0 | for_each_vcpu(d, v) |
1284 | 0 | for ( i = 0 ; i < 4 ; i++ ) |
1285 | 0 | if ( !pagetable_is_null(v->arch.shadow_table[i]) ) |
1286 | 0 | shadow_unhook_mappings(d, |
1287 | 0 | pagetable_get_mfn(v->arch.shadow_table[i]), 0); |
1288 | 0 |
1289 | 0 | /* Make sure everyone sees the unshadowings */ |
1290 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
1291 | 0 | } |
1292 | | |
1293 | | void shadow_blow_tables_per_domain(struct domain *d) |
1294 | 0 | { |
1295 | 0 | if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) { |
1296 | 0 | paging_lock(d); |
1297 | 0 | shadow_blow_tables(d); |
1298 | 0 | paging_unlock(d); |
1299 | 0 | } |
1300 | 0 | } |
1301 | | |
1302 | | #ifndef NDEBUG |
1303 | | /* Blow all shadows of all shadowed domains: this can be used to cause the |
1304 | | * guest's pagetables to be re-shadowed if we suspect that the shadows |
1305 | | * have somehow got out of sync */ |
1306 | | static void shadow_blow_all_tables(unsigned char c) |
1307 | 0 | { |
1308 | 0 | struct domain *d; |
1309 | 0 | printk("'%c' pressed -> blowing all shadow tables\n", c); |
1310 | 0 | rcu_read_lock(&domlist_read_lock); |
1311 | 0 | for_each_domain(d) |
1312 | 0 | { |
1313 | 0 | if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) |
1314 | 0 | { |
1315 | 0 | paging_lock(d); |
1316 | 0 | shadow_blow_tables(d); |
1317 | 0 | paging_unlock(d); |
1318 | 0 | } |
1319 | 0 | } |
1320 | 0 | rcu_read_unlock(&domlist_read_lock); |
1321 | 0 | } |
1322 | | |
1323 | | /* Register this function in the Xen console keypress table */ |
1324 | | static __init int shadow_blow_tables_keyhandler_init(void) |
1325 | 1 | { |
1326 | 1 | register_keyhandler('S', shadow_blow_all_tables, "reset shadow pagetables", 1); |
1327 | 1 | return 0; |
1328 | 1 | } |
1329 | | __initcall(shadow_blow_tables_keyhandler_init); |
1330 | | #endif /* !NDEBUG */ |
1331 | | |
1332 | | /* Accessors for the singly-linked list that's used for hash chains */ |
1333 | | static inline struct page_info * |
1334 | | next_shadow(const struct page_info *sp) |
1335 | 0 | { |
1336 | 0 | return sp->next_shadow ? pdx_to_page(sp->next_shadow) : NULL; |
1337 | 0 | } |
1338 | | |
1339 | | static inline void |
1340 | | set_next_shadow(struct page_info *sp, struct page_info *next) |
1341 | 0 | { |
1342 | 0 | sp->next_shadow = next ? page_to_pdx(next) : 0; |
1343 | 0 | } |
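next_shadow chains the members of a hash bucket through a compact page index (a pdx) rather than a full pointer, with index 0 doubling as the end-of-chain marker, which presumably relies on the page at pdx 0 never being a shadow page. The same trick over a plain array of nodes, as a self-contained model:

    #include <stddef.h>

    struct node {
        unsigned long next_idx;   /* 0 means end of chain, like pdx 0 */
        int payload;
    };

    static struct node pool[16];  /* slot 0 is reserved as the sentinel */

    static struct node *next_node(const struct node *n)
    {
        return n->next_idx ? &pool[n->next_idx] : NULL;
    }

    static void set_next_node(struct node *n, struct node *next)
    {
        n->next_idx = next ? (unsigned long)(next - pool) : 0;
    }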
1344 | | |
1345 | | /* Allocate another shadow's worth of (contiguous, aligned) pages, |
1346 | | * and fill in the type and backpointer fields of their page_infos. |
1347 | | * Never fails to allocate. */ |
1348 | | mfn_t shadow_alloc(struct domain *d, |
1349 | | u32 shadow_type, |
1350 | | unsigned long backpointer) |
1351 | 0 | { |
1352 | 0 | struct page_info *sp = NULL; |
1353 | 0 | unsigned int pages = shadow_size(shadow_type); |
1354 | 0 | struct page_list_head tmp_list; |
1355 | 0 | cpumask_t mask; |
1356 | 0 | unsigned int i; |
1357 | 0 |
1358 | 0 | ASSERT(paging_locked_by_me(d)); |
1359 | 0 | ASSERT(shadow_type != SH_type_none); |
1360 | 0 | perfc_incr(shadow_alloc); |
1361 | 0 |
1362 | 0 | if ( d->arch.paging.shadow.free_pages < pages ) |
1363 | 0 | { |
1364 | 0 | /* If we get here, we failed to allocate. This should never |
1365 | 0 | * happen. It means that we didn't call shadow_prealloc() |
1366 | 0 | * correctly before we allocated. We can't recover by calling |
1367 | 0 | * prealloc here, because we might free up higher-level pages |
1368 | 0 | * that the caller is working on. */ |
1369 | 0 | SHADOW_ERROR("Can't allocate %i shadow pages!\n", pages); |
1370 | 0 | BUG(); |
1371 | 0 | } |
1372 | 0 | d->arch.paging.shadow.free_pages -= pages; |
1373 | 0 |
1374 | 0 | /* Backpointers that are MFNs need to be packed into PDXs (PFNs don't) */ |
1375 | 0 | switch (shadow_type) |
1376 | 0 | { |
1377 | 0 | case SH_type_fl1_32_shadow: |
1378 | 0 | case SH_type_fl1_pae_shadow: |
1379 | 0 | case SH_type_fl1_64_shadow: |
1380 | 0 | break; |
1381 | 0 | default: |
1382 | 0 | backpointer = pfn_to_pdx(backpointer); |
1383 | 0 | break; |
1384 | 0 | } |
1385 | 0 |
1386 | 0 | INIT_PAGE_LIST_HEAD(&tmp_list); |
1387 | 0 |
1388 | 0 | /* Init page info fields and clear the pages */ |
1389 | 0 | for ( i = 0; i < pages ; i++ ) |
1390 | 0 | { |
1391 | 0 | sp = page_list_remove_head(&d->arch.paging.shadow.freelist); |
1392 | 0 | /* Before we overwrite the old contents of this page, |
1393 | 0 | * we need to be sure that no TLB holds a pointer to it. */ |
1394 | 0 | cpumask_copy(&mask, d->domain_dirty_cpumask); |
1395 | 0 | tlbflush_filter(&mask, sp->tlbflush_timestamp); |
1396 | 0 | if ( unlikely(!cpumask_empty(&mask)) ) |
1397 | 0 | { |
1398 | 0 | perfc_incr(shadow_alloc_tlbflush); |
1399 | 0 | flush_tlb_mask(&mask); |
1400 | 0 | } |
1401 | 0 | /* Now safe to clear the page for reuse */ |
1402 | 0 | clear_domain_page(page_to_mfn(sp)); |
1403 | 0 | INIT_PAGE_LIST_ENTRY(&sp->list); |
1404 | 0 | page_list_add(sp, &tmp_list); |
1405 | 0 | sp->u.sh.type = shadow_type; |
1406 | 0 | sp->u.sh.pinned = 0; |
1407 | 0 | sp->u.sh.count = 0; |
1408 | 0 | sp->u.sh.head = 0; |
1409 | 0 | sp->v.sh.back = backpointer; |
1410 | 0 | set_next_shadow(sp, NULL); |
1411 | 0 | perfc_incr(shadow_alloc_count); |
1412 | 0 | } |
1413 | 0 | if ( shadow_type >= SH_type_min_shadow |
1414 | 0 | && shadow_type <= SH_type_max_shadow ) |
1415 | 0 | sp->u.sh.head = 1; |
1416 | 0 |
1417 | 0 | sh_terminate_list(&tmp_list); |
1418 | 0 |
1419 | 0 | return page_to_mfn(sp); |
1420 | 0 | } |
1421 | | |
1422 | | |
1423 | | /* Return some shadow pages to the pool. */ |
1424 | | void shadow_free(struct domain *d, mfn_t smfn) |
1425 | 0 | { |
1426 | 0 | struct page_info *next = NULL, *sp = mfn_to_page(smfn); |
1427 | 0 | struct page_list_head *pin_list; |
1428 | 0 | unsigned int pages; |
1429 | 0 | u32 shadow_type; |
1430 | 0 | int i; |
1431 | 0 |
1432 | 0 | ASSERT(paging_locked_by_me(d)); |
1433 | 0 | perfc_incr(shadow_free); |
1434 | 0 |
1435 | 0 | shadow_type = sp->u.sh.type; |
1436 | 0 | ASSERT(shadow_type != SH_type_none); |
1437 | 0 | ASSERT(sp->u.sh.head || (shadow_type > SH_type_max_shadow)); |
1438 | 0 | pages = shadow_size(shadow_type); |
1439 | 0 | pin_list = &d->arch.paging.shadow.pinned_shadows; |
1440 | 0 |
1441 | 0 | for ( i = 0; i < pages; i++ ) |
1442 | 0 | { |
1443 | 0 | #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION) |
1444 | 0 | struct vcpu *v; |
1445 | 0 | for_each_vcpu(d, v) |
1446 | 0 | { |
1447 | 0 | #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC |
1448 | 0 | /* No longer safe to look for a writeable mapping in this shadow */ |
1449 | 0 | if ( v->arch.paging.shadow.last_writeable_pte_smfn |
1450 | 0 | == mfn_x(page_to_mfn(sp)) ) |
1451 | 0 | v->arch.paging.shadow.last_writeable_pte_smfn = 0; |
1452 | 0 | #endif |
1453 | 0 | #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION |
1454 | 0 | v->arch.paging.last_write_emul_ok = 0; |
1455 | 0 | #endif |
1456 | 0 | } |
1457 | 0 | #endif |
1458 | 0 | /* Get the next page before we overwrite the list header */ |
1459 | 0 | if ( i < pages - 1 ) |
1460 | 0 | next = page_list_next(sp, pin_list); |
1461 | 0 | /* Strip out the type: this is now a free shadow page */ |
1462 | 0 | sp->u.sh.type = sp->u.sh.head = 0; |
1463 | 0 | /* Remember the TLB timestamp so we will know whether to flush |
1464 | 0 | * TLBs when we reuse the page. Because the destructors leave the |
1465 | 0 | * contents of the pages in place, we can delay TLB flushes until |
1466 | 0 | * just before the allocator hands the page out again. */ |
1467 | 0 | page_set_tlbflush_timestamp(sp); |
1468 | 0 | perfc_decr(shadow_alloc_count); |
1469 | 0 | page_list_add_tail(sp, &d->arch.paging.shadow.freelist); |
1470 | 0 | sp = next; |
1471 | 0 | } |
1472 | 0 |
|
1473 | 0 | d->arch.paging.shadow.free_pages += pages; |
1474 | 0 | } |
1475 | | |
1476 | | /* Divert a page from the pool to be used by the p2m mapping. |
1477 | | * This action is irreversible: the p2m mapping only ever grows. |
1478 | | * That's OK because the p2m table only exists for translated domains, |
1479 | | * and those domains can't ever turn off shadow mode. */ |
1480 | | static struct page_info * |
1481 | | shadow_alloc_p2m_page(struct domain *d) |
1482 | 0 | { |
1483 | 0 | struct page_info *pg; |
1484 | 0 |
|
1485 | 0 | /* This is called both from the p2m code (which never holds the |
1486 | 0 | * paging lock) and the log-dirty code (which always does). */ |
1487 | 0 | paging_lock_recursive(d); |
1488 | 0 |
|
1489 | 0 | if ( d->arch.paging.shadow.total_pages |
1490 | 0 | < shadow_min_acceptable_pages(d) + 1 ) |
1491 | 0 | { |
1492 | 0 | if ( !d->arch.paging.p2m_alloc_failed ) |
1493 | 0 | { |
1494 | 0 | d->arch.paging.p2m_alloc_failed = 1; |
1495 | 0 | dprintk(XENLOG_ERR, "d%i failed to allocate from shadow pool\n", |
1496 | 0 | d->domain_id); |
1497 | 0 | } |
1498 | 0 | paging_unlock(d); |
1499 | 0 | return NULL; |
1500 | 0 | } |
1501 | 0 |
|
1502 | 0 | shadow_prealloc(d, SH_type_p2m_table, 1); |
1503 | 0 | pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); |
1504 | 0 | d->arch.paging.shadow.p2m_pages++; |
1505 | 0 | d->arch.paging.shadow.total_pages--; |
1506 | 0 |
|
1507 | 0 | paging_unlock(d); |
1508 | 0 |
|
1509 | 0 | /* Unlike shadow pages, mark p2m pages as owned by the domain. |
1510 | 0 | * Marking the domain as the owner would normally allow the guest to |
1511 | 0 | * create mappings of these pages, but these p2m pages will never be |
1512 | 0 | * in the domain's guest-physical address space, and so that is not |
1513 | 0 | * believed to be a concern. */ |
1514 | 0 | page_set_owner(pg, d); |
1515 | 0 | pg->count_info |= 1; |
1516 | 0 | return pg; |
1517 | 0 | } |
1518 | | |
1519 | | static void |
1520 | | shadow_free_p2m_page(struct domain *d, struct page_info *pg) |
1521 | 0 | { |
1522 | 0 | ASSERT(page_get_owner(pg) == d); |
1523 | 0 | /* Should have just the one ref we gave it in alloc_p2m_page() */ |
1524 | 0 | if ( (pg->count_info & PGC_count_mask) != 1 ) |
1525 | 0 | { |
1526 | 0 | SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n", |
1527 | 0 | pg->count_info, pg->u.inuse.type_info); |
1528 | 0 | } |
1529 | 0 | pg->count_info &= ~PGC_count_mask; |
1530 | 0 | pg->u.sh.type = SH_type_p2m_table; /* p2m code reuses type-info */ |
1531 | 0 | page_set_owner(pg, NULL); |
1532 | 0 |
|
1533 | 0 | /* This is called both from the p2m code (which never holds the |
1534 | 0 | * paging lock) and the log-dirty code (which always does). */ |
1535 | 0 | paging_lock_recursive(d); |
1536 | 0 |
|
1537 | 0 | shadow_free(d, page_to_mfn(pg)); |
1538 | 0 | d->arch.paging.shadow.p2m_pages--; |
1539 | 0 | d->arch.paging.shadow.total_pages++; |
1540 | 0 |
|
1541 | 0 | paging_unlock(d); |
1542 | 0 | } |
1543 | | |
1544 | | int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted) |
1545 | 0 | { |
1546 | 0 | struct page_info *sp; |
1547 | 0 | unsigned int lower_bound; |
1548 | 0 |
|
1549 | 0 | ASSERT(paging_locked_by_me(d)); |
1550 | 0 |
|
1551 | 0 | if ( pages > 0 ) |
1552 | 0 | { |
1553 | 0 | /* Check for minimum value. */ |
1554 | 0 | if ( pages < d->arch.paging.shadow.p2m_pages ) |
1555 | 0 | pages = 0; |
1556 | 0 | else |
1557 | 0 | pages -= d->arch.paging.shadow.p2m_pages; |
1558 | 0 |
|
1559 | 0 | /* Don't allocate less than the minimum acceptable, plus one page per |
1560 | 0 | * megabyte of RAM (for the p2m table) */ |
1561 | 0 | lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256); |
1562 | 0 | if ( pages < lower_bound ) |
1563 | 0 | pages = lower_bound; |
1564 | 0 | } |
1565 | 0 |
|
1566 | 0 | SHADOW_PRINTK("current %i target %i\n", |
1567 | 0 | d->arch.paging.shadow.total_pages, pages); |
1568 | 0 |
|
1569 | 0 | for ( ; ; ) |
1570 | 0 | { |
1571 | 0 | if ( d->arch.paging.shadow.total_pages < pages ) |
1572 | 0 | { |
1573 | 0 | /* Need to allocate more memory from domheap */ |
1574 | 0 | sp = (struct page_info *) |
1575 | 0 | alloc_domheap_page(d, MEMF_no_owner); |
1576 | 0 | if ( sp == NULL ) |
1577 | 0 | { |
1578 | 0 | SHADOW_PRINTK("failed to allocate shadow pages.\n"); |
1579 | 0 | return -ENOMEM; |
1580 | 0 | } |
1581 | 0 | d->arch.paging.shadow.free_pages++; |
1582 | 0 | d->arch.paging.shadow.total_pages++; |
1583 | 0 | sp->u.sh.type = 0; |
1584 | 0 | sp->u.sh.pinned = 0; |
1585 | 0 | sp->u.sh.count = 0; |
1586 | 0 | sp->tlbflush_timestamp = 0; /* Not in any TLB */ |
1587 | 0 | page_list_add_tail(sp, &d->arch.paging.shadow.freelist); |
1588 | 0 | } |
1589 | 0 | else if ( d->arch.paging.shadow.total_pages > pages ) |
1590 | 0 | { |
1591 | 0 | /* Need to return memory to domheap */ |
1592 | 0 | _shadow_prealloc(d, 1); |
1593 | 0 | sp = page_list_remove_head(&d->arch.paging.shadow.freelist); |
1594 | 0 | ASSERT(sp); |
1595 | 0 | /* |
1596 | 0 | * The pages were allocated anonymously, but the owner field |
1597 | 0 | * gets overwritten normally, so need to clear it here. |
1598 | 0 | * gets overwritten normally, so it needs to be cleared here.
1599 | 0 | page_set_owner(sp, NULL); |
1600 | 0 | d->arch.paging.shadow.free_pages--; |
1601 | 0 | d->arch.paging.shadow.total_pages--; |
1602 | 0 | free_domheap_page(sp); |
1603 | 0 | } |
1604 | 0 | else |
1605 | 0 | break; |
1606 | 0 |
|
1607 | 0 | /* Check to see if we need to yield and try again */ |
1608 | 0 | if ( preempted && general_preempt_check() ) |
1609 | 0 | { |
1610 | 0 | *preempted = true; |
1611 | 0 | return 0; |
1612 | 0 | } |
1613 | 0 | } |
1614 | 0 |
|
1615 | 0 | return 0; |
1616 | 0 | } |
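The lower bound above reserves one page per megabyte of guest RAM for the p2m, which is where tot_pages / 256 comes from with 4 KiB pages (256 pages per MiB). A worked example under that assumption, with illustrative numbers:

#include <stdio.h>

int main(void)
{
    unsigned long tot_pages = 524288;        /* a 2 GiB guest, 4 KiB pages */
    unsigned long per_mb = tot_pages / 256;  /* one page per MiB of RAM */

    printf("p2m headroom: %lu pages (%lu MiB)\n", per_mb, per_mb / 256);
    return 0;
}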
1617 | | |
1618 | | /* Return the size of the shadow pool, rounded up to the nearest MB */ |
1619 | | static unsigned int shadow_get_allocation(struct domain *d) |
1620 | 0 | { |
1621 | 0 | unsigned int pg = d->arch.paging.shadow.total_pages |
1622 | 0 | + d->arch.paging.shadow.p2m_pages; |
1623 | 0 | return ((pg >> (20 - PAGE_SHIFT)) |
1624 | 0 | + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); |
1625 | 0 | } |
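The return expression is a round-up division from pages to MiB. A self-contained sketch, assuming PAGE_SHIFT == 12:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages */

static unsigned int pages_to_mb_round_up(unsigned int pg)
{
    return (pg >> (20 - PAGE_SHIFT))
           + ((pg & ((1u << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0);
}

int main(void)
{
    printf("%u\n", pages_to_mb_round_up(256));  /* exactly 1 MiB -> 1 */
    printf("%u\n", pages_to_mb_round_up(257));  /* one page over -> 2 */
    return 0;
}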
1626 | | |
1627 | | /**************************************************************************/ |
1628 | | /* Handling guest writes to pagetables. */ |
1629 | | |
1630 | | /* |
1631 | | * Translate a VA to an MFN, injecting a page-fault if we fail. If the |
1632 | | * mapping succeeds, a reference will be held on the underlying page. |
1633 | | */ |
1634 | 0 | #define BAD_GVA_TO_GFN (~0UL) |
1635 | 0 | #define BAD_GFN_TO_MFN (~1UL) |
1636 | 0 | #define READONLY_GFN (~2UL) |
1637 | | static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr, |
1638 | | struct sh_emulate_ctxt *sh_ctxt) |
1639 | 0 | { |
1640 | 0 | unsigned long gfn; |
1641 | 0 | struct page_info *page; |
1642 | 0 | mfn_t mfn; |
1643 | 0 | p2m_type_t p2mt; |
1644 | 0 | uint32_t pfec = PFEC_page_present | PFEC_write_access; |
1645 | 0 |
|
1646 | 0 | /* Translate the VA to a GFN. */ |
1647 | 0 | gfn = paging_get_hostmode(v)->gva_to_gfn(v, NULL, vaddr, &pfec); |
1648 | 0 | if ( gfn == gfn_x(INVALID_GFN) ) |
1649 | 0 | { |
1650 | 0 | x86_emul_pagefault(pfec, vaddr, &sh_ctxt->ctxt); |
1651 | 0 |
|
1652 | 0 | return _mfn(BAD_GVA_TO_GFN); |
1653 | 0 | } |
1654 | 0 |
|
1655 | 0 | /* Translate the GFN to an MFN. */ |
1656 | 0 | ASSERT(!paging_locked_by_me(v->domain)); |
1657 | 0 |
|
1658 | 0 | page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC); |
1659 | 0 |
|
1660 | 0 | /* Sanity checking. */ |
1661 | 0 | if ( page == NULL ) |
1662 | 0 | { |
1663 | 0 | return _mfn(BAD_GFN_TO_MFN); |
1664 | 0 | } |
1665 | 0 | if ( p2m_is_discard_write(p2mt) ) |
1666 | 0 | { |
1667 | 0 | put_page(page); |
1668 | 0 | return _mfn(READONLY_GFN); |
1669 | 0 | } |
1670 | 0 | if ( !p2m_is_ram(p2mt) ) |
1671 | 0 | { |
1672 | 0 | put_page(page); |
1673 | 0 | return _mfn(BAD_GFN_TO_MFN); |
1674 | 0 | } |
1675 | 0 | mfn = page_to_mfn(page); |
1676 | 0 | ASSERT(mfn_valid(mfn)); |
1677 | 0 |
|
1678 | 0 | v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn); |
1679 | 0 |
|
1680 | 0 | return mfn; |
1681 | 0 | } |
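The sentinel values ~0UL, ~1UL and ~2UL can never be real frame numbers, so mfn_valid() rejects them and callers such as sh_emulate_map_dest() below recover the failure cause with a switch. A hedged standalone sketch of that encoding:

#include <stdio.h>

#define BAD_GVA_TO_GFN (~0UL)   /* translation faulted: #PF injected  */
#define BAD_GFN_TO_MFN (~1UL)   /* gfn has no usable RAM behind it    */
#define READONLY_GFN   (~2UL)   /* write-discard page: fail silently  */

static const char *decode_map_failure(unsigned long mfn)
{
    switch ( mfn )
    {
    case BAD_GVA_TO_GFN: return "exception";
    case READONLY_GFN:   return "silent fail";
    case BAD_GFN_TO_MFN: return "unhandleable";
    default:             return "valid mapping";
    }
}

int main(void)
{
    printf("%s\n", decode_map_failure(READONLY_GFN));
    return 0;
}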
1682 | | |
1683 | | /* |
1684 | | * Check that the user is allowed to perform this write. If a mapping is |
1685 | | * returned, page references will be held on sh_ctxt->mfn[0] and, if
1686 | | * it is not INVALID_MFN, on sh_ctxt->mfn[1].
1687 | | */ |
1688 | | void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr, |
1689 | | unsigned int bytes, |
1690 | | struct sh_emulate_ctxt *sh_ctxt) |
1691 | 0 | { |
1692 | 0 | struct domain *d = v->domain; |
1693 | 0 | void *map; |
1694 | 0 |
|
1695 | 0 | #ifndef NDEBUG |
1696 | 0 | /* We don't emulate user-mode writes to page tables. */ |
1697 | 0 | if ( is_hvm_domain(d) ? hvm_get_cpl(v) == 3 |
1698 | 0 | : !guest_kernel_mode(v, guest_cpu_user_regs()) ) |
1699 | 0 | { |
1700 | 0 | gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached " |
1701 | 0 | "emulate_map_dest(). This should never happen!\n"); |
1702 | 0 | return MAPPING_UNHANDLEABLE; |
1703 | 0 | } |
1704 | 0 | #endif |
1705 | 0 |
|
1706 | 0 | sh_ctxt->mfn[0] = emulate_gva_to_mfn(v, vaddr, sh_ctxt); |
1707 | 0 | if ( !mfn_valid(sh_ctxt->mfn[0]) ) |
1708 | 0 | { |
1709 | 0 | switch ( mfn_x(sh_ctxt->mfn[0]) ) |
1710 | 0 | { |
1711 | 0 | case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; |
1712 | 0 | case READONLY_GFN: return MAPPING_SILENT_FAIL; |
1713 | 0 | default: return MAPPING_UNHANDLEABLE; |
1714 | 0 | } |
1715 | 0 | } |
1716 | 0 |
|
1717 | 0 | /* Unaligned writes probably mean this isn't a pagetable. */
1718 | 0 | if ( vaddr & (bytes - 1) ) |
1719 | 0 | sh_remove_shadows(d, sh_ctxt->mfn[0], 0, 0 /* Slow, can fail. */ ); |
1720 | 0 |
|
1721 | 0 | if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) ) |
1722 | 0 | { |
1723 | 0 | /* Whole write fits on a single page. */ |
1724 | 0 | sh_ctxt->mfn[1] = INVALID_MFN; |
1725 | 0 | map = map_domain_page(sh_ctxt->mfn[0]) + (vaddr & ~PAGE_MASK); |
1726 | 0 | } |
1727 | 0 | else if ( !is_hvm_domain(d) ) |
1728 | 0 | { |
1729 | 0 | /* |
1730 | 0 | * Cross-page emulated writes are only supported for HVM guests; |
1731 | 0 | * PV guests ought to know better. |
1732 | 0 | */ |
1733 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[0])); |
1734 | 0 | return MAPPING_UNHANDLEABLE; |
1735 | 0 | } |
1736 | 0 | else |
1737 | 0 | { |
1738 | 0 | /* This write crosses a page boundary. Translate the second page. */ |
1739 | 0 | sh_ctxt->mfn[1] = emulate_gva_to_mfn( |
1740 | 0 | v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt); |
1741 | 0 | if ( !mfn_valid(sh_ctxt->mfn[1]) ) |
1742 | 0 | { |
1743 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[0])); |
1744 | 0 | switch ( mfn_x(sh_ctxt->mfn[1]) ) |
1745 | 0 | { |
1746 | 0 | case BAD_GVA_TO_GFN: return MAPPING_EXCEPTION; |
1747 | 0 | case READONLY_GFN: return MAPPING_SILENT_FAIL; |
1748 | 0 | default: return MAPPING_UNHANDLEABLE; |
1749 | 0 | } |
1750 | 0 | } |
1751 | 0 |
|
1752 | 0 | /* Cross-page writes probably mean this isn't a pagetable. */
1753 | 0 | sh_remove_shadows(d, sh_ctxt->mfn[1], 0, 0 /* Slow, can fail. */ ); |
1754 | 0 |
|
1755 | 0 | map = vmap(sh_ctxt->mfn, 2); |
1756 | 0 | if ( !map ) |
1757 | 0 | { |
1758 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[0])); |
1759 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[1])); |
1760 | 0 | return MAPPING_UNHANDLEABLE; |
1761 | 0 | } |
1762 | 0 | map += (vaddr & ~PAGE_MASK); |
1763 | 0 | } |
1764 | 0 |
|
1765 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) |
1766 | 0 | /* |
1767 | 0 | * Remember if the bottom bit was clear, so we can choose not to run |
1768 | 0 | * the change through the verify code if it's still clear afterwards. |
1769 | 0 | */ |
1770 | 0 | sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT); |
1771 | 0 | #endif |
1772 | 0 |
|
1773 | 0 | return map; |
1774 | 0 | } |
1775 | | |
1776 | | /* |
1777 | | * Tidy up after the emulated write: mark pages dirty, verify the new |
1778 | | * contents, and undo the mapping. |
1779 | | */ |
1780 | | void sh_emulate_unmap_dest(struct vcpu *v, void *addr, unsigned int bytes, |
1781 | | struct sh_emulate_ctxt *sh_ctxt) |
1782 | 0 | { |
1783 | 0 | u32 b1 = bytes, b2 = 0, shflags; |
1784 | 0 |
|
1785 | 0 | /* |
1786 | 0 | * We can avoid re-verifying the page contents after the write if: |
1787 | 0 | * - it was no larger than the PTE type of this pagetable; |
1788 | 0 | * - it was aligned to the PTE boundaries; and |
1789 | 0 | * - _PAGE_PRESENT was clear before and after the write. |
1790 | 0 | */ |
1791 | 0 | shflags = mfn_to_page(sh_ctxt->mfn[0])->shadow_flags; |
1792 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) |
1793 | 0 | if ( sh_ctxt->low_bit_was_clear |
1794 | 0 | && !(*(u8 *)addr & _PAGE_PRESENT) |
1795 | 0 | && ((!(shflags & SHF_32) |
1796 | 0 | /* |
1797 | 0 | * Not shadowed 32-bit: aligned 64-bit writes that leave |
1798 | 0 | * the present bit unset are safe to ignore. |
1799 | 0 | */ |
1800 | 0 | && ((unsigned long)addr & 7) == 0 |
1801 | 0 | && bytes <= 8) |
1802 | 0 | || |
1803 | 0 | (!(shflags & (SHF_PAE|SHF_64)) |
1804 | 0 | /* |
1805 | 0 | * Not shadowed PAE/64-bit: aligned 32-bit writes that |
1806 | 0 | * leave the present bit unset are safe to ignore. |
1807 | 0 | */ |
1808 | 0 | && ((unsigned long)addr & 3) == 0 |
1809 | 0 | && bytes <= 4)) ) |
1810 | 0 | { |
1811 | 0 | /* Writes with this alignment constraint can't possibly cross pages. */ |
1812 | 0 | ASSERT(!mfn_valid(sh_ctxt->mfn[1])); |
1813 | 0 | } |
1814 | 0 | else |
1815 | 0 | #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */ |
1816 | 0 | { |
1817 | 0 | if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) |
1818 | 0 | { |
1819 | 0 | /* Validate as two writes, one to each page. */ |
1820 | 0 | b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK); |
1821 | 0 | b2 = bytes - b1; |
1822 | 0 | ASSERT(b2 < bytes); |
1823 | 0 | } |
1824 | 0 | if ( likely(b1 > 0) ) |
1825 | 0 | sh_validate_guest_pt_write(v, sh_ctxt->mfn[0], addr, b1); |
1826 | 0 | if ( unlikely(b2 > 0) ) |
1827 | 0 | sh_validate_guest_pt_write(v, sh_ctxt->mfn[1], addr + b1, b2); |
1828 | 0 | } |
1829 | 0 |
|
1830 | 0 | paging_mark_dirty(v->domain, sh_ctxt->mfn[0]); |
1831 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[0])); |
1832 | 0 |
|
1833 | 0 | if ( unlikely(mfn_valid(sh_ctxt->mfn[1])) ) |
1834 | 0 | { |
1835 | 0 | paging_mark_dirty(v->domain, sh_ctxt->mfn[1]); |
1836 | 0 | put_page(mfn_to_page(sh_ctxt->mfn[1])); |
1837 | 0 | vunmap((void *)((unsigned long)addr & PAGE_MASK)); |
1838 | 0 | } |
1839 | 0 | else |
1840 | 0 | unmap_domain_page(addr); |
1841 | 0 |
|
1842 | 0 | atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version); |
1843 | 0 | } |
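In the cross-page path above, the write is validated as two pieces: b1 bytes up to the end of the first page and b2 for the spill onto the second. A minimal model of the split, assuming 4 KiB pages:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long addr = 0x1ffe;          /* 2 bytes short of a page end */
    unsigned int bytes = 8, b1, b2;

    b1 = PAGE_SIZE - (addr & ~PAGE_MASK); /* room left on the first page */
    b2 = bytes - b1;                      /* spills onto the second page */
    assert(b1 == 2 && b2 == 6);
    printf("b1=%u b2=%u\n", b1, b2);
    return 0;
}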
1844 | | |
1845 | | /**************************************************************************/ |
1846 | | /* Hash table for storing the guest->shadow mappings. |
1847 | | * The table itself is an array of pointers to shadows; the shadows are then |
1848 | | * threaded on a singly-linked list of shadows with the same hash value */ |
1849 | | |
1850 | 0 | #define SHADOW_HASH_BUCKETS 251 |
1851 | | /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */ |
1852 | | |
1853 | | /* Hash function that takes a gfn or mfn, plus another byte of type info */ |
1854 | | typedef u32 key_t; |
1855 | | static inline key_t sh_hash(unsigned long n, unsigned int t) |
1856 | 0 | { |
1857 | 0 | unsigned char *p = (unsigned char *)&n; |
1858 | 0 | key_t k = t; |
1859 | 0 | int i; |
1860 | 0 | for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; |
1861 | 0 | return k % SHADOW_HASH_BUCKETS; |
1862 | 0 | } |
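This is an sdbm-style byte hash: (k<<6) + (k<<16) - k equals k * 65599, folded over each byte of the backpointer with the shadow type as the seed. An illustrative standalone version:

#include <stdint.h>
#include <stdio.h>

#define BUCKETS 251   /* same prime bucket count as above */

static uint32_t bucket_of(unsigned long n, unsigned int t)
{
    const unsigned char *p = (const unsigned char *)&n;
    uint32_t k = t;

    for ( size_t i = 0; i < sizeof(n); i++ )
        k = (uint32_t)p[i] + k * 65599u; /* == p[i] + (k<<6) + (k<<16) - k */
    return k % BUCKETS;
}

int main(void)
{
    printf("bucket %u\n", bucket_of(0x12345UL, 8));
    return 0;
}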
1863 | | |
1864 | | #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL) |
1865 | | |
1866 | | /* Before we get to the mechanism, define a pair of audit functions |
1867 | | * that sanity-check the contents of the hash table. */ |
1868 | | static void sh_hash_audit_bucket(struct domain *d, int bucket) |
1869 | | /* Audit one bucket of the hash table */ |
1870 | 0 | { |
1871 | 0 | struct page_info *sp, *x; |
1872 | 0 |
|
1873 | 0 | if ( !(SHADOW_AUDIT_ENABLE) ) |
1874 | 0 | return; |
1875 | 0 |
|
1876 | 0 | sp = d->arch.paging.shadow.hash_table[bucket]; |
1877 | 0 | while ( sp ) |
1878 | 0 | { |
1879 | 0 | /* Not a shadow? */ |
1880 | 0 | BUG_ON( (sp->count_info & PGC_count_mask) != 0 );
1881 | 0 | /* Bogus type? */ |
1882 | 0 | BUG_ON( sp->u.sh.type == 0 ); |
1883 | 0 | BUG_ON( sp->u.sh.type > SH_type_max_shadow ); |
1884 | 0 | /* Wrong page of a multi-page shadow? */ |
1885 | 0 | BUG_ON( !sp->u.sh.head ); |
1886 | 0 | /* Wrong bucket? */ |
1887 | 0 | BUG_ON( sh_hash(__backpointer(sp), sp->u.sh.type) != bucket ); |
1888 | 0 | /* Duplicate entry? */ |
1889 | 0 | for ( x = next_shadow(sp); x; x = next_shadow(x) ) |
1890 | 0 | BUG_ON( x->v.sh.back == sp->v.sh.back && |
1891 | 0 | x->u.sh.type == sp->u.sh.type ); |
1892 | 0 | /* Follow the backpointer to the guest pagetable */ |
1893 | 0 | if ( sp->u.sh.type != SH_type_fl1_32_shadow |
1894 | 0 | && sp->u.sh.type != SH_type_fl1_pae_shadow |
1895 | 0 | && sp->u.sh.type != SH_type_fl1_64_shadow ) |
1896 | 0 | { |
1897 | 0 | struct page_info *gpg = mfn_to_page(backpointer(sp)); |
1898 | 0 | /* Bad shadow flags on guest page? */ |
1899 | 0 | BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) ); |
1900 | 0 | /* Bad type count on guest page? */ |
1901 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
1902 | 0 | if ( sp->u.sh.type == SH_type_l1_32_shadow |
1903 | 0 | || sp->u.sh.type == SH_type_l1_pae_shadow |
1904 | 0 | || sp->u.sh.type == SH_type_l1_64_shadow ) |
1905 | 0 | { |
1906 | 0 | if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page |
1907 | 0 | && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) |
1908 | 0 | { |
1909 | 0 | if ( !page_is_out_of_sync(gpg) ) |
1910 | 0 | { |
1911 | 0 | SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")" |
1912 | 0 | " and not OOS but has typecount %#lx\n", |
1913 | 0 | __backpointer(sp), |
1914 | 0 | mfn_x(page_to_mfn(sp)), |
1915 | 0 | gpg->u.inuse.type_info); |
1916 | 0 | BUG(); |
1917 | 0 | } |
1918 | 0 | } |
1919 | 0 | } |
1920 | 0 | else /* Not an l1 */ |
1921 | 0 | #endif |
1922 | 0 | if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page |
1923 | 0 | && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) |
1924 | 0 | { |
1925 | 0 | SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")" |
1926 | 0 | " but has typecount %#lx\n", |
1927 | 0 | __backpointer(sp), mfn_x(page_to_mfn(sp)), |
1928 | 0 | gpg->u.inuse.type_info); |
1929 | 0 | BUG(); |
1930 | 0 | } |
1931 | 0 | } |
1932 | 0 | /* That entry was OK; on we go */ |
1933 | 0 | sp = next_shadow(sp); |
1934 | 0 | } |
1935 | 0 | } |
1936 | | |
1937 | | #else |
1938 | | #define sh_hash_audit_bucket(_d, _b) do {} while(0) |
1939 | | #endif /* Hashtable bucket audit */ |
1940 | | |
1941 | | |
1942 | | #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL |
1943 | | |
1944 | | static void sh_hash_audit(struct domain *d) |
1945 | | /* Full audit: audit every bucket in the table */ |
1946 | | { |
1947 | | int i; |
1948 | | |
1949 | | if ( !(SHADOW_AUDIT_ENABLE) ) |
1950 | | return; |
1951 | | |
1952 | | for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) |
1953 | | { |
1954 | | sh_hash_audit_bucket(d, i); |
1955 | | } |
1956 | | } |
1957 | | |
1958 | | #else |
1959 | 0 | #define sh_hash_audit(_d) do {} while(0) |
1960 | | #endif /* Hashtable bucket audit */ |
1961 | | |
1962 | | /* Allocate and initialise the table itself. |
1963 | | * Returns 0 for success, 1 for error. */ |
1964 | | static int shadow_hash_alloc(struct domain *d) |
1965 | 0 | { |
1966 | 0 | struct page_info **table; |
1967 | 0 |
|
1968 | 0 | ASSERT(paging_locked_by_me(d)); |
1969 | 0 | ASSERT(!d->arch.paging.shadow.hash_table); |
1970 | 0 |
|
1971 | 0 | table = xzalloc_array(struct page_info *, SHADOW_HASH_BUCKETS); |
1972 | 0 | if ( !table ) return 1; |
1973 | 0 | d->arch.paging.shadow.hash_table = table; |
1974 | 0 | return 0; |
1975 | 0 | } |
1976 | | |
1977 | | /* Tear down the hash table and return all memory to Xen. |
1978 | | * This function does not care whether the table is populated. */ |
1979 | | static void shadow_hash_teardown(struct domain *d) |
1980 | 0 | { |
1981 | 0 | ASSERT(paging_locked_by_me(d)); |
1982 | 0 | ASSERT(d->arch.paging.shadow.hash_table); |
1983 | 0 |
|
1984 | 0 | xfree(d->arch.paging.shadow.hash_table); |
1985 | 0 | d->arch.paging.shadow.hash_table = NULL; |
1986 | 0 | } |
1987 | | |
1988 | | |
1989 | | mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t) |
1990 | | /* Find an entry in the hash table. Returns the MFN of the shadow, |
1991 | | * or INVALID_MFN if it doesn't exist */ |
1992 | 0 | { |
1993 | 0 | struct page_info *sp, *prev; |
1994 | 0 | key_t key; |
1995 | 0 |
|
1996 | 0 | ASSERT(paging_locked_by_me(d)); |
1997 | 0 | ASSERT(d->arch.paging.shadow.hash_table); |
1998 | 0 | ASSERT(t); |
1999 | 0 |
|
2000 | 0 | sh_hash_audit(d); |
2001 | 0 |
|
2002 | 0 | perfc_incr(shadow_hash_lookups); |
2003 | 0 | key = sh_hash(n, t); |
2004 | 0 | sh_hash_audit_bucket(d, key); |
2005 | 0 |
|
2006 | 0 | sp = d->arch.paging.shadow.hash_table[key]; |
2007 | 0 | prev = NULL; |
2008 | 0 | while ( sp )
2009 | 0 | { |
2010 | 0 | if ( __backpointer(sp) == n && sp->u.sh.type == t ) |
2011 | 0 | { |
2012 | 0 | /* Pull-to-front if 'sp' isn't already the head item */ |
2013 | 0 | if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) ) |
2014 | 0 | { |
2015 | 0 | if ( unlikely(d->arch.paging.shadow.hash_walking != 0) ) |
2016 | 0 | /* Can't reorder: someone is walking the hash chains */ |
2017 | 0 | return page_to_mfn(sp); |
2018 | 0 | else |
2019 | 0 | { |
2020 | 0 | ASSERT(prev); |
2021 | 0 | /* Delete sp from the list */ |
2022 | 0 | prev->next_shadow = sp->next_shadow; |
2023 | 0 | /* Re-insert it at the head of the list */ |
2024 | 0 | set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]); |
2025 | 0 | d->arch.paging.shadow.hash_table[key] = sp; |
2026 | 0 | } |
2027 | 0 | } |
2028 | 0 | else |
2029 | 0 | { |
2030 | 0 | perfc_incr(shadow_hash_lookup_head); |
2031 | 0 | } |
2032 | 0 | return page_to_mfn(sp); |
2033 | 0 | } |
2034 | 0 | prev = sp; |
2035 | 0 | sp = next_shadow(sp); |
2036 | 0 | } |
2037 | 0 |
|
2038 | 0 | perfc_incr(shadow_hash_lookup_miss); |
2039 | 0 | return INVALID_MFN; |
2040 | 0 | } |
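The pull-to-front step is a classic move-to-front optimisation: a hit that is not already the bucket head is unlinked and re-inserted at the head, keeping hot shadows cheap to find, and it is skipped while hash_walking is set because reordering would break a concurrent iterator. A minimal sketch on a plain singly-linked list (hypothetical node type, not Xen's page_info):

#include <stddef.h>
#include <stdio.h>

struct node { unsigned long key; struct node *next; };

static struct node *lookup_mtf(struct node **head, unsigned long key)
{
    struct node *prev = NULL, *n;

    for ( n = *head; n; prev = n, n = n->next )
    {
        if ( n->key != key )
            continue;
        if ( prev != NULL )       /* not the head: unlink, re-insert */
        {
            prev->next = n->next;
            n->next = *head;
            *head = n;
        }
        return n;
    }
    return NULL;
}

int main(void)
{
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct node *head = &a;

    lookup_mtf(&head, 3);                /* c moves to the front */
    printf("head key %lu\n", head->key); /* prints 3 */
    return 0;
}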
2041 | | |
2042 | | void shadow_hash_insert(struct domain *d, unsigned long n, unsigned int t, |
2043 | | mfn_t smfn) |
2044 | | /* Put a mapping (n,t)->smfn into the hash table */ |
2045 | 0 | { |
2046 | 0 | struct page_info *sp; |
2047 | 0 | key_t key; |
2048 | 0 |
|
2049 | 0 | ASSERT(paging_locked_by_me(d)); |
2050 | 0 | ASSERT(d->arch.paging.shadow.hash_table); |
2051 | 0 | ASSERT(t); |
2052 | 0 |
|
2053 | 0 | sh_hash_audit(d); |
2054 | 0 |
|
2055 | 0 | perfc_incr(shadow_hash_inserts); |
2056 | 0 | key = sh_hash(n, t); |
2057 | 0 | sh_hash_audit_bucket(d, key); |
2058 | 0 |
|
2059 | 0 | /* Insert this shadow at the top of the bucket */ |
2060 | 0 | sp = mfn_to_page(smfn); |
2061 | 0 | set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]); |
2062 | 0 | d->arch.paging.shadow.hash_table[key] = sp; |
2063 | 0 |
|
2064 | 0 | sh_hash_audit_bucket(d, key); |
2065 | 0 | } |
2066 | | |
2067 | | void shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t, |
2068 | | mfn_t smfn) |
2069 | | /* Excise the mapping (n,t)->smfn from the hash table */ |
2070 | 0 | { |
2071 | 0 | struct page_info *sp, *x; |
2072 | 0 | key_t key; |
2073 | 0 |
|
2074 | 0 | ASSERT(paging_locked_by_me(d)); |
2075 | 0 | ASSERT(d->arch.paging.shadow.hash_table); |
2076 | 0 | ASSERT(t); |
2077 | 0 |
|
2078 | 0 | sh_hash_audit(d); |
2079 | 0 |
|
2080 | 0 | perfc_incr(shadow_hash_deletes); |
2081 | 0 | key = sh_hash(n, t); |
2082 | 0 | sh_hash_audit_bucket(d, key); |
2083 | 0 |
|
2084 | 0 | sp = mfn_to_page(smfn); |
2085 | 0 | if ( d->arch.paging.shadow.hash_table[key] == sp ) |
2086 | 0 | /* Easy case: we're deleting the head item. */ |
2087 | 0 | d->arch.paging.shadow.hash_table[key] = next_shadow(sp); |
2088 | 0 | else |
2089 | 0 | { |
2090 | 0 | /* Need to search for the one we want */ |
2091 | 0 | x = d->arch.paging.shadow.hash_table[key]; |
2092 | 0 | while ( 1 ) |
2093 | 0 | { |
2094 | 0 | ASSERT(x); /* We can't have hit the end, since our target is |
2095 | 0 | * still in the chain somewhere... */
2096 | 0 | if ( next_shadow(x) == sp ) |
2097 | 0 | { |
2098 | 0 | x->next_shadow = sp->next_shadow; |
2099 | 0 | break; |
2100 | 0 | } |
2101 | 0 | x = next_shadow(x); |
2102 | 0 | } |
2103 | 0 | } |
2104 | 0 | set_next_shadow(sp, NULL); |
2105 | 0 |
|
2106 | 0 | sh_hash_audit_bucket(d, key); |
2107 | 0 | } |
2108 | | |
2109 | | typedef int (*hash_vcpu_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); |
2110 | | typedef int (*hash_domain_callback_t)(struct domain *d, mfn_t smfn, mfn_t other_mfn); |
2111 | | |
2112 | | static void hash_vcpu_foreach(struct vcpu *v, unsigned int callback_mask, |
2113 | | const hash_vcpu_callback_t callbacks[], |
2114 | | mfn_t callback_mfn) |
2115 | | /* Walk the hash table looking at the types of the entries and |
2116 | | * calling the appropriate callback function for each entry. |
2117 | | * The mask determines which shadow types we call back for, and the array |
2118 | | * of callbacks tells us which function to call. |
2119 | | * Any callback may return non-zero to let us skip the rest of the scan. |
2120 | | * |
2121 | | * WARNING: Callbacks MUST NOT add or remove hash entries unless they |
2122 | | * then return non-zero to terminate the scan. */ |
2123 | 0 | { |
2124 | 0 | int i, done = 0; |
2125 | 0 | struct domain *d = v->domain; |
2126 | 0 | struct page_info *x; |
2127 | 0 |
|
2128 | 0 | ASSERT(paging_locked_by_me(d)); |
2129 | 0 |
|
2130 | 0 | /* Can be called via p2m code &c after shadow teardown. */ |
2131 | 0 | if ( unlikely(!d->arch.paging.shadow.hash_table) ) |
2132 | 0 | return; |
2133 | 0 |
|
2134 | 0 | /* Say we're here, to stop hash-lookups reordering the chains */ |
2135 | 0 | ASSERT(d->arch.paging.shadow.hash_walking == 0); |
2136 | 0 | d->arch.paging.shadow.hash_walking = 1; |
2137 | 0 |
|
2138 | 0 | for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) |
2139 | 0 | { |
2140 | 0 | /* WARNING: This is not safe against changes to the hash table. |
2141 | 0 | * The callback *must* return non-zero if it has inserted or |
2142 | 0 | * deleted anything from the hash (lookups are OK, though). */ |
2143 | 0 | for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) ) |
2144 | 0 | { |
2145 | 0 | if ( callback_mask & (1 << x->u.sh.type) ) |
2146 | 0 | { |
2147 | 0 | ASSERT(x->u.sh.type <= 15); |
2148 | 0 | ASSERT(callbacks[x->u.sh.type] != NULL); |
2149 | 0 | done = callbacks[x->u.sh.type](v, page_to_mfn(x), |
2150 | 0 | callback_mfn); |
2151 | 0 | if ( done ) break; |
2152 | 0 | } |
2153 | 0 | } |
2154 | 0 | if ( done ) break; |
2155 | 0 | } |
2156 | 0 | d->arch.paging.shadow.hash_walking = 0; |
2157 | 0 | } |
2158 | | |
2159 | | static void hash_domain_foreach(struct domain *d, |
2160 | | unsigned int callback_mask, |
2161 | | const hash_domain_callback_t callbacks[], |
2162 | | mfn_t callback_mfn) |
2163 | | /* Walk the hash table looking at the types of the entries and |
2164 | | * calling the appropriate callback function for each entry. |
2165 | | * The mask determines which shadow types we call back for, and the array |
2166 | | * of callbacks tells us which function to call. |
2167 | | * Any callback may return non-zero to let us skip the rest of the scan. |
2168 | | * |
2169 | | * WARNING: Callbacks MUST NOT add or remove hash entries unless they |
2170 | | * then return non-zero to terminate the scan. */ |
2171 | 0 | { |
2172 | 0 | int i, done = 0; |
2173 | 0 | struct page_info *x; |
2174 | 0 |
|
2175 | 0 | ASSERT(paging_locked_by_me(d)); |
2176 | 0 |
|
2177 | 0 | /* Can be called via p2m code &c after shadow teardown. */ |
2178 | 0 | if ( unlikely(!d->arch.paging.shadow.hash_table) ) |
2179 | 0 | return; |
2180 | 0 |
|
2181 | 0 | /* Say we're here, to stop hash-lookups reordering the chains */ |
2182 | 0 | ASSERT(d->arch.paging.shadow.hash_walking == 0); |
2183 | 0 | d->arch.paging.shadow.hash_walking = 1; |
2184 | 0 |
|
2185 | 0 | for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) |
2186 | 0 | { |
2187 | 0 | /* WARNING: This is not safe against changes to the hash table. |
2188 | 0 | * The callback *must* return non-zero if it has inserted or |
2189 | 0 | * deleted anything from the hash (lookups are OK, though). */ |
2190 | 0 | for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) ) |
2191 | 0 | { |
2192 | 0 | if ( callback_mask & (1 << x->u.sh.type) ) |
2193 | 0 | { |
2194 | 0 | ASSERT(x->u.sh.type <= 15); |
2195 | 0 | ASSERT(callbacks[x->u.sh.type] != NULL); |
2196 | 0 | done = callbacks[x->u.sh.type](d, page_to_mfn(x), |
2197 | 0 | callback_mfn); |
2198 | 0 | if ( done ) break; |
2199 | 0 | } |
2200 | 0 | } |
2201 | 0 | if ( done ) break; |
2202 | 0 | } |
2203 | 0 | d->arch.paging.shadow.hash_walking = 0; |
2204 | 0 | } |
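Both walkers share one pattern: the shadow type indexes a callback table and a per-type bit in the mask selects which entries fire; a non-zero return ends the scan, which is what makes it legal for a callback to mutate the table on its final invocation. A condensed model with hypothetical names:

#include <stddef.h>
#include <stdio.h>

typedef int (*walk_cb_t)(unsigned int type);

/* Visit every entry; call back only for types selected in the mask. */
static void walk_masked(const unsigned int *types, size_t count,
                        unsigned int mask, const walk_cb_t cbs[])
{
    for ( size_t i = 0; i < count; i++ )
    {
        unsigned int t = types[i];

        if ( !(mask & (1u << t)) )
            continue;
        if ( cbs[t] && cbs[t](t) )
            break;               /* non-zero: terminate the whole scan */
    }
}

static int print_cb(unsigned int type)
{
    printf("type %u\n", type);
    return 0;                    /* keep scanning */
}

int main(void)
{
    unsigned int types[] = { 1, 2, 4, 2 };
    walk_cb_t cbs[8] = { [1] = print_cb, [2] = print_cb, [4] = print_cb };

    walk_masked(types, 4, (1u << 1) | (1u << 2), cbs);
    return 0;
}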
2205 | | |
2206 | | |
2207 | | /**************************************************************************/ |
2208 | | /* Destroy a shadow page: simple dispatcher to call the per-type destructor |
2209 | | * which will decrement refcounts appropriately and return memory to the |
2210 | | * free pool. */ |
2211 | | |
2212 | | void sh_destroy_shadow(struct domain *d, mfn_t smfn) |
2213 | 0 | { |
2214 | 0 | struct page_info *sp = mfn_to_page(smfn); |
2215 | 0 | unsigned int t = sp->u.sh.type; |
2216 | 0 |
|
2217 | 0 |
|
2218 | 0 | SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn)); |
2219 | 0 |
|
2220 | 0 | /* Double-check, if we can, that the shadowed page belongs to this |
2221 | 0 | * domain (by following the back-pointer). */
2222 | 0 | ASSERT(t == SH_type_fl1_32_shadow || |
2223 | 0 | t == SH_type_fl1_pae_shadow || |
2224 | 0 | t == SH_type_fl1_64_shadow || |
2225 | 0 | t == SH_type_monitor_table || |
2226 | 0 | (is_pv_32bit_domain(d) && t == SH_type_l4_64_shadow) || |
2227 | 0 | (page_get_owner(mfn_to_page(backpointer(sp))) == d)); |
2228 | 0 |
|
2229 | 0 | /* The down-shifts here are so that the switch statement is on nice |
2230 | 0 | * small numbers that the compiler will enjoy */ |
2231 | 0 | switch ( t ) |
2232 | 0 | { |
2233 | 0 | case SH_type_l1_32_shadow: |
2234 | 0 | case SH_type_fl1_32_shadow: |
2235 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(d, smfn); |
2236 | 0 | break; |
2237 | 0 | case SH_type_l2_32_shadow: |
2238 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(d, smfn); |
2239 | 0 | break; |
2240 | 0 |
|
2241 | 0 | case SH_type_l1_pae_shadow: |
2242 | 0 | case SH_type_fl1_pae_shadow: |
2243 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(d, smfn); |
2244 | 0 | break; |
2245 | 0 | case SH_type_l2_pae_shadow: |
2246 | 0 | case SH_type_l2h_pae_shadow: |
2247 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(d, smfn); |
2248 | 0 | break; |
2249 | 0 |
|
2250 | 0 | case SH_type_l1_64_shadow: |
2251 | 0 | case SH_type_fl1_64_shadow: |
2252 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(d, smfn); |
2253 | 0 | break; |
2254 | 0 | case SH_type_l2h_64_shadow: |
2255 | 0 | ASSERT(is_pv_32bit_domain(d)); |
2256 | 0 | /* Fall through... */ |
2257 | 0 | case SH_type_l2_64_shadow: |
2258 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(d, smfn); |
2259 | 0 | break; |
2260 | 0 | case SH_type_l3_64_shadow: |
2261 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(d, smfn); |
2262 | 0 | break; |
2263 | 0 | case SH_type_l4_64_shadow: |
2264 | 0 | SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(d, smfn); |
2265 | 0 | break; |
2266 | 0 |
|
2267 | 0 | default: |
2268 | 0 | SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n", |
2269 | 0 | (unsigned long)t); |
2270 | 0 | BUG(); |
2271 | 0 | } |
2272 | 0 | } |
2273 | | |
2274 | | static inline void trace_shadow_wrmap_bf(mfn_t gmfn) |
2275 | 0 | { |
2276 | 0 | if ( tb_init_done ) |
2277 | 0 | { |
2278 | 0 | /* Convert gmfn to gfn */ |
2279 | 0 | unsigned long gfn = mfn_to_gfn(current->domain, gmfn); |
2280 | 0 | __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), &gfn); |
2281 | 0 | } |
2282 | 0 | } |
2283 | | |
2284 | | /**************************************************************************/ |
2285 | | /* Remove all writeable mappings of a guest frame from the shadow tables |
2286 | | * Returns non-zero if we need to flush TLBs. |
2287 | | * level and fault_addr describe how we found this to be a pagetable;
2288 | | * level==0 means we have some other reason for revoking write access. |
2289 | | * If level==0 we are allowed to fail, returning -1. */ |
2290 | | |
2291 | | int sh_remove_write_access(struct domain *d, mfn_t gmfn, |
2292 | | unsigned int level, |
2293 | | unsigned long fault_addr) |
2294 | 0 | { |
2295 | 0 | /* Dispatch table for getting per-type functions */ |
2296 | 0 | static const hash_domain_callback_t callbacks[SH_type_unused] = { |
2297 | 0 | NULL, /* none */ |
2298 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */ |
2299 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */ |
2300 | 0 | NULL, /* l2_32 */ |
2301 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */ |
2302 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */ |
2303 | 0 | NULL, /* l2_pae */ |
2304 | 0 | NULL, /* l2h_pae */ |
2305 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */ |
2306 | 0 | SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */ |
2307 | 0 | NULL, /* l2_64 */ |
2308 | 0 | NULL, /* l2h_64 */ |
2309 | 0 | NULL, /* l3_64 */ |
2310 | 0 | NULL, /* l4_64 */ |
2311 | 0 | NULL, /* p2m */ |
2312 | 0 | NULL /* unused */ |
2313 | 0 | }; |
2314 | 0 |
|
2315 | 0 | static const unsigned int callback_mask = |
2316 | 0 | SHF_L1_32 |
2317 | 0 | | SHF_FL1_32 |
2318 | 0 | | SHF_L1_PAE |
2319 | 0 | | SHF_FL1_PAE |
2320 | 0 | | SHF_L1_64 |
2321 | 0 | | SHF_FL1_64 |
2322 | 0 | ; |
2323 | 0 | struct page_info *pg = mfn_to_page(gmfn); |
2324 | 0 | #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC |
2325 | 0 | struct vcpu *curr = current; |
2326 | 0 | #endif |
2327 | 0 |
|
2328 | 0 | ASSERT(paging_locked_by_me(d)); |
2329 | 0 |
|
2330 | 0 | /* Only remove writable mappings if we are doing shadow refcounts. |
2331 | 0 | * In guest refcounting, we trust Xen to already be restricting |
2332 | 0 | * all the writes to the guest page tables, so we do not need to |
2333 | 0 | * do more. */ |
2334 | 0 | if ( !shadow_mode_refcounts(d) ) |
2335 | 0 | return 0; |
2336 | 0 |
|
2337 | 0 | /* Early exit if it's already a pagetable, or otherwise not writeable */ |
2338 | 0 | if ( (sh_mfn_is_a_page_table(gmfn) |
2339 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
2340 | 0 | /* Unless they've been allowed to go out of sync with their shadows */ |
2341 | 0 | && !mfn_oos_may_write(gmfn) |
2342 | 0 | #endif |
2343 | 0 | ) |
2344 | 0 | || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) |
2345 | 0 | return 0; |
2346 | 0 |
|
2347 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP); |
2348 | 0 |
|
2349 | 0 | perfc_incr(shadow_writeable); |
2350 | 0 |
|
2351 | 0 | /* If this isn't a "normal" writeable page, the domain is trying to |
2352 | 0 | * put pagetables in special memory of some kind. We can't allow that. */ |
2353 | 0 | if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) |
2354 | 0 | { |
2355 | 0 | SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" |
2356 | 0 | PRtype_info "\n", |
2357 | 0 | mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); |
2358 | 0 | domain_crash(d); |
2359 | 0 | } |
2360 | 0 |
|
2361 | 0 | #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC |
2362 | 0 | if ( curr->domain == d ) |
2363 | 0 | { |
2364 | 0 | unsigned long gfn; |
2365 | 0 | /* Heuristic: there is likely to be only one writeable mapping, |
2366 | 0 | * and that mapping is likely to be in the current pagetable, |
2367 | 0 | * in the guest's linear map (on non-HIGHPTE linux and windows)*/ |
2368 | 0 |
|
2369 | 0 | #define GUESS(_a, _h) do { \ |
2370 | 0 | if ( curr->arch.paging.mode->shadow.guess_wrmap( \ |
2371 | 0 | curr, (_a), gmfn) ) \ |
2372 | 0 | perfc_incr(shadow_writeable_h_ ## _h); \ |
2373 | 0 | if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ |
2374 | 0 | { \ |
2375 | 0 | TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \ |
2376 | 0 | return 1; \ |
2377 | 0 | } \ |
2378 | 0 | } while (0) |
2379 | 0 |
|
2380 | 0 | if ( curr->arch.paging.mode->guest_levels == 2 ) |
2381 | 0 | { |
2382 | 0 | if ( level == 1 ) |
2383 | 0 | /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ |
2384 | 0 | GUESS(0xC0000000UL + (fault_addr >> 10), 1); |
2385 | 0 |
|
2386 | 0 | /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */ |
2387 | 0 | if ( (gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
2388 | 0 | GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4); |
2389 | 0 |
|
2390 | 0 | /* FreeBSD: Linear map at 0xBFC00000 */ |
2391 | 0 | if ( level == 1 ) |
2392 | 0 | GUESS(0xBFC00000UL |
2393 | 0 | + ((fault_addr & VADDR_MASK) >> 10), 6); |
2394 | 0 | } |
2395 | 0 | else if ( curr->arch.paging.mode->guest_levels == 3 ) |
2396 | 0 | { |
2397 | 0 | /* 32bit PAE w2k3: linear map at 0xC0000000 */ |
2398 | 0 | switch ( level ) |
2399 | 0 | { |
2400 | 0 | case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; |
2401 | 0 | case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; |
2402 | 0 | } |
2403 | 0 |
|
2404 | 0 | /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */ |
2405 | 0 | if ( (gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
2406 | 0 | GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4); |
2407 | 0 |
|
2408 | 0 | /* FreeBSD PAE: Linear map at 0xBF800000 */ |
2409 | 0 | switch ( level ) |
2410 | 0 | { |
2411 | 0 | case 1: GUESS(0xBF800000UL |
2412 | 0 | + ((fault_addr & VADDR_MASK) >> 9), 6); break; |
2413 | 0 | case 2: GUESS(0xBFDFC000UL |
2414 | 0 | + ((fault_addr & VADDR_MASK) >> 18), 6); break; |
2415 | 0 | } |
2416 | 0 | } |
2417 | 0 | else if ( curr->arch.paging.mode->guest_levels == 4 ) |
2418 | 0 | { |
2419 | 0 | /* 64bit w2k3: linear map at 0xfffff68000000000 */ |
2420 | 0 | switch ( level ) |
2421 | 0 | { |
2422 | 0 | case 1: GUESS(0xfffff68000000000UL |
2423 | 0 | + ((fault_addr & VADDR_MASK) >> 9), 3); break; |
2424 | 0 | case 2: GUESS(0xfffff6fb40000000UL |
2425 | 0 | + ((fault_addr & VADDR_MASK) >> 18), 3); break; |
2426 | 0 | case 3: GUESS(0xfffff6fb7da00000UL |
2427 | 0 | + ((fault_addr & VADDR_MASK) >> 27), 3); break; |
2428 | 0 | } |
2429 | 0 |
|
2430 | 0 | /* 64bit Linux direct map at 0xffff880000000000; older kernels |
2431 | 0 | * had it at 0xffff810000000000, and older kernels yet had it |
2432 | 0 | * at 0x0000010000000000UL */ |
2433 | 0 | gfn = mfn_to_gfn(d, gmfn); |
2434 | 0 | GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4); |
2435 | 0 | GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4); |
2436 | 0 | GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4); |
2437 | 0 |
|
2438 | 0 | /* |
2439 | 0 | * 64bit Solaris kernel page map at |
2440 | 0 | * kpm_vbase; 0xfffffe0000000000UL |
2441 | 0 | */ |
2442 | 0 | GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4); |
2443 | 0 |
|
2444 | 0 | /* FreeBSD 64bit: linear map 0xffff800000000000 */ |
2445 | 0 | switch ( level ) |
2446 | 0 | { |
2447 | 0 | case 1: GUESS(0xffff800000000000 |
2448 | 0 | + ((fault_addr & VADDR_MASK) >> 9), 6); break; |
2449 | 0 | case 2: GUESS(0xffff804000000000UL |
2450 | 0 | + ((fault_addr & VADDR_MASK) >> 18), 6); break; |
2451 | 0 | case 3: GUESS(0xffff804020000000UL |
2452 | 0 | + ((fault_addr & VADDR_MASK) >> 27), 6); break; |
2453 | 0 | } |
2454 | 0 | /* FreeBSD 64bit: direct map at 0xffffff0000000000 */ |
2455 | 0 | GUESS(0xffffff0000000000 + (gfn << PAGE_SHIFT), 6); |
2456 | 0 | } |
2457 | 0 |
|
2458 | 0 | #undef GUESS |
2459 | 0 | } |
2460 | 0 |
|
2461 | 0 | if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) |
2462 | 0 | return 1; |
2463 | 0 |
|
2464 | 0 | /* Second heuristic: on HIGHPTE linux, there are two particular PTEs |
2465 | 0 | * (entries in the fixmap) where linux maps its pagetables. Since |
2466 | 0 | * we expect to hit them most of the time, we start the search for |
2467 | 0 | * the writeable mapping by looking at the same MFN where the last |
2468 | 0 | * brute-force search succeeded. */ |
2469 | 0 |
|
2470 | 0 | if ( (curr->domain == d) && |
2471 | 0 | (curr->arch.paging.shadow.last_writeable_pte_smfn != 0) ) |
2472 | 0 | { |
2473 | 0 | unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask); |
2474 | 0 | mfn_t last_smfn = _mfn(curr->arch.paging.shadow.last_writeable_pte_smfn); |
2475 | 0 | int shtype = mfn_to_page(last_smfn)->u.sh.type; |
2476 | 0 |
|
2477 | 0 | if ( callbacks[shtype] ) |
2478 | 0 | callbacks[shtype](d, last_smfn, gmfn); |
2479 | 0 |
|
2480 | 0 | if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count ) |
2481 | 0 | perfc_incr(shadow_writeable_h_5); |
2482 | 0 | } |
2483 | 0 |
|
2484 | 0 | if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) |
2485 | 0 | return 1; |
2486 | 0 |
|
2487 | 0 | #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */ |
2488 | 0 |
|
2489 | 0 | /* Brute-force search of all the shadows, by walking the hash */ |
2490 | 0 | trace_shadow_wrmap_bf(gmfn); |
2491 | 0 | if ( level == 0 ) |
2492 | 0 | perfc_incr(shadow_writeable_bf_1); |
2493 | 0 | else |
2494 | 0 | perfc_incr(shadow_writeable_bf); |
2495 | 0 | hash_domain_foreach(d, callback_mask, callbacks, gmfn); |
2496 | 0 |
|
2497 | 0 | /* If that didn't catch the mapping, then there's some non-pagetable |
2498 | 0 | * mapping -- ioreq page, grant mapping, &c. */ |
2499 | 0 | if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) |
2500 | 0 | { |
2501 | 0 | if ( level == 0 ) |
2502 | 0 | return -1; |
2503 | 0 |
|
2504 | 0 | SHADOW_ERROR("can't remove write access to mfn %lx: guest has " |
2505 | 0 | "%lu special-use mappings of it\n", mfn_x(gmfn), |
2506 | 0 | (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); |
2507 | 0 | domain_crash(d); |
2508 | 0 | } |
2509 | 0 |
|
2510 | 0 | /* We killed at least one writeable mapping, so must flush TLBs. */ |
2511 | 0 | return 1; |
2512 | 0 | } |
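The shift constants in GUESS() fall out of page-table geometry: with 4 KiB pages the L1 entry covering va has index va >> 12, and multiplying by the entry size (4 bytes for 2-level guests, 8 for PAE/64-bit) yields the >> 10 and >> 9 forms above. A worked sketch with an illustrative linear-map base:

#include <stdio.h>

/* Address of the PTE mapping va inside a guest's linear map. */
static unsigned long pte_va_4byte(unsigned long base, unsigned long va)
{
    return base + ((va >> 12) << 2);   /* == base + (va >> 10) */
}

static unsigned long pte_va_8byte(unsigned long base, unsigned long va)
{
    return base + ((va >> 12) << 3);   /* == base + (va >> 9) */
}

int main(void)
{
    printf("%#lx\n", pte_va_4byte(0xC0000000UL, 0x00400000UL));
    printf("%#lx\n", pte_va_8byte(0xC0000000UL, 0x00400000UL));
    return 0;
}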
2513 | | |
2514 | | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
2515 | | int sh_remove_write_access_from_sl1p(struct domain *d, mfn_t gmfn, |
2516 | | mfn_t smfn, unsigned long off) |
2517 | 0 | { |
2518 | 0 | struct page_info *sp = mfn_to_page(smfn); |
2519 | 0 |
|
2520 | 0 | ASSERT(mfn_valid(smfn)); |
2521 | 0 | ASSERT(mfn_valid(gmfn)); |
2522 | 0 |
|
2523 | 0 | if ( sp->u.sh.type == SH_type_l1_32_shadow |
2524 | 0 | || sp->u.sh.type == SH_type_fl1_32_shadow ) |
2525 | 0 | { |
2526 | 0 | return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2) |
2527 | 0 | (d, gmfn, smfn, off); |
2528 | 0 | } |
2529 | 0 | else if ( sp->u.sh.type == SH_type_l1_pae_shadow |
2530 | 0 | || sp->u.sh.type == SH_type_fl1_pae_shadow ) |
2531 | 0 | return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3) |
2532 | 0 | (d, gmfn, smfn, off); |
2533 | 0 | else if ( sp->u.sh.type == SH_type_l1_64_shadow |
2534 | 0 | || sp->u.sh.type == SH_type_fl1_64_shadow ) |
2535 | 0 | return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4) |
2536 | 0 | (d, gmfn, smfn, off); |
2537 | 0 |
|
2538 | 0 | return 0; |
2539 | 0 | } |
2540 | | #endif |
2541 | | |
2542 | | /**************************************************************************/ |
2543 | | /* Remove all mappings of a guest frame from the shadow tables. |
2544 | | * Returns non-zero if we need to flush TLBs. */ |
2545 | | |
2546 | | static int sh_remove_all_mappings(struct domain *d, mfn_t gmfn, gfn_t gfn) |
2547 | 0 | { |
2548 | 0 | struct page_info *page = mfn_to_page(gmfn); |
2549 | 0 |
|
2550 | 0 | /* Dispatch table for getting per-type functions */ |
2551 | 0 | static const hash_domain_callback_t callbacks[SH_type_unused] = { |
2552 | 0 | NULL, /* none */ |
2553 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */ |
2554 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */ |
2555 | 0 | NULL, /* l2_32 */ |
2556 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */ |
2557 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */ |
2558 | 0 | NULL, /* l2_pae */ |
2559 | 0 | NULL, /* l2h_pae */ |
2560 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */ |
2561 | 0 | SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */ |
2562 | 0 | NULL, /* l2_64 */ |
2563 | 0 | NULL, /* l2h_64 */ |
2564 | 0 | NULL, /* l3_64 */ |
2565 | 0 | NULL, /* l4_64 */ |
2566 | 0 | NULL, /* p2m */ |
2567 | 0 | NULL /* unused */ |
2568 | 0 | }; |
2569 | 0 |
|
2570 | 0 | static const unsigned int callback_mask = |
2571 | 0 | SHF_L1_32 |
2572 | 0 | | SHF_FL1_32 |
2573 | 0 | | SHF_L1_PAE |
2574 | 0 | | SHF_FL1_PAE |
2575 | 0 | | SHF_L1_64 |
2576 | 0 | | SHF_FL1_64 |
2577 | 0 | ; |
2578 | 0 |
|
2579 | 0 | perfc_incr(shadow_mappings); |
2580 | 0 | if ( sh_check_page_has_no_refs(page) ) |
2581 | 0 | return 0; |
2582 | 0 |
|
2583 | 0 | /* Although this is an externally visible function, we do not know |
2584 | 0 | * whether the paging lock will be held when it is called (since it |
2585 | 0 | * can be called via put_page_type when we clear a shadow l1e).*/ |
2586 | 0 | paging_lock_recursive(d); |
2587 | 0 |
|
2588 | 0 | /* XXX TODO: |
2589 | 0 | * Heuristics for finding the (probably) single mapping of this gmfn */ |
2590 | 0 |
|
2591 | 0 | /* Brute-force search of all the shadows, by walking the hash */ |
2592 | 0 | perfc_incr(shadow_mappings_bf); |
2593 | 0 | hash_domain_foreach(d, callback_mask, callbacks, gmfn); |
2594 | 0 |
|
2595 | 0 | /* If that didn't catch the mapping, something is very wrong */ |
2596 | 0 | if ( !sh_check_page_has_no_refs(page) ) |
2597 | 0 | { |
2598 | 0 | /* |
2599 | 0 | * Don't complain if we're in HVM and there are some extra mappings: |
2600 | 0 | * The qemu helper process has an untyped mapping of this dom's RAM |
2601 | 0 | * and the HVM restore program takes another. |
2602 | 0 | * Also allow one typed refcount for |
2603 | 0 | * - Xen heap pages, to match share_xen_page_with_guest(), |
2604 | 0 | * - ioreq server pages, to match prepare_ring_for_helper(). |
2605 | 0 | */ |
2606 | 0 | if ( !(shadow_mode_external(d) |
2607 | 0 | && (page->count_info & PGC_count_mask) <= 3 |
2608 | 0 | && ((page->u.inuse.type_info & PGT_count_mask) |
2609 | 0 | == (is_xen_heap_page(page) || |
2610 | 0 | is_ioreq_server_page(d, page)))) ) |
2611 | 0 | { |
2612 | 0 | SHADOW_ERROR("can't find all mappings of mfn %lx (gfn %lx): " |
2613 | 0 | "c=%lx t=%lx x=%d i=%d\n", mfn_x(gmfn), gfn_x(gfn), |
2614 | 0 | page->count_info, page->u.inuse.type_info, |
2615 | 0 | !!is_xen_heap_page(page), is_ioreq_server_page(d, page)); |
2616 | 0 | } |
2617 | 0 | } |
2618 | 0 |
|
2619 | 0 | paging_unlock(d); |
2620 | 0 |
|
2621 | 0 | /* We killed at least one mapping, so must flush TLBs. */ |
2622 | 0 | return 1; |
2623 | 0 | } |
2624 | | |
2625 | | |
2626 | | /**************************************************************************/ |
2627 | | /* Remove all shadows of a guest frame from the shadow tables */ |
2628 | | |
2629 | | static int sh_remove_shadow_via_pointer(struct domain *d, mfn_t smfn) |
2630 | | /* Follow this shadow's up-pointer, if it has one, and remove the reference |
2631 | | * found there. Returns 1 if that was the only reference to this shadow */ |
2632 | 0 | { |
2633 | 0 | struct page_info *sp = mfn_to_page(smfn); |
2634 | 0 | mfn_t pmfn; |
2635 | 0 | void *vaddr; |
2636 | 0 | int rc; |
2637 | 0 |
|
2638 | 0 | ASSERT(sp->u.sh.type > 0); |
2639 | 0 | ASSERT(sp->u.sh.type < SH_type_max_shadow); |
2640 | 0 | ASSERT(sh_type_has_up_pointer(d, sp->u.sh.type)); |
2641 | 0 |
|
2642 | 0 | if ( sp->up == 0 ) return 0;
2643 | 0 | pmfn = maddr_to_mfn(sp->up); |
2644 | 0 | ASSERT(mfn_valid(pmfn)); |
2645 | 0 | vaddr = map_domain_page(pmfn); |
2646 | 0 | ASSERT(vaddr); |
2647 | 0 | vaddr += sp->up & (PAGE_SIZE-1); |
2648 | 0 | ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); |
2649 | 0 |
|
2650 | 0 | /* Is this the only reference to this shadow? */ |
2651 | 0 | rc = (sp->u.sh.count == 1) ? 1 : 0; |
2652 | 0 |
|
2653 | 0 | /* Blank the offending entry */ |
2654 | 0 | switch ( sp->u.sh.type )
2655 | 0 | { |
2656 | 0 | case SH_type_l1_32_shadow: |
2657 | 0 | case SH_type_l2_32_shadow: |
2658 | 0 | SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(d, vaddr, pmfn); |
2659 | 0 | break; |
2660 | 0 | case SH_type_l1_pae_shadow: |
2661 | 0 | case SH_type_l2_pae_shadow: |
2662 | 0 | case SH_type_l2h_pae_shadow: |
2663 | 0 | SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(d, vaddr, pmfn); |
2664 | 0 | break; |
2665 | 0 | case SH_type_l1_64_shadow: |
2666 | 0 | case SH_type_l2_64_shadow: |
2667 | 0 | case SH_type_l2h_64_shadow: |
2668 | 0 | case SH_type_l3_64_shadow: |
2669 | 0 | case SH_type_l4_64_shadow: |
2670 | 0 | SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(d, vaddr, pmfn); |
2671 | 0 | break; |
2672 | 0 | default: BUG(); /* Some weird unknown shadow type */
2673 | 0 | } |
2674 | 0 |
|
2675 | 0 | unmap_domain_page(vaddr); |
2676 | 0 | if ( rc ) |
2677 | 0 | perfc_incr(shadow_up_pointer); |
2678 | 0 | else |
2679 | 0 | perfc_incr(shadow_unshadow_bf); |
2680 | 0 |
|
2681 | 0 | return rc; |
2682 | 0 | } |
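sp->up packs the machine address of the single referencing shadow entry, so maddr_to_mfn() recovers the frame and the low bits give the byte offset of the l1e within it. A tiny model of that unpacking, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned long up = 0x123456ff8UL;             /* hypothetical maddr */
    unsigned long mfn    = up >> PAGE_SHIFT;      /* maddr_to_mfn()     */
    unsigned long offset = up & (PAGE_SIZE - 1);  /* entry within page  */

    printf("mfn %#lx offset %#lx\n", mfn, offset);
    return 0;
}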
2683 | | |
2684 | | void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all) |
2685 | | /* Remove the shadows of this guest page. |
2686 | | * If fast != 0, just try the quick heuristic, which will remove |
2687 | | * at most one reference to each shadow of the page. Otherwise, walk |
2688 | | * all the shadow tables looking for refs to shadows of this gmfn. |
2689 | | * If all != 0, kill the domain if we can't find all the shadows. |
2690 | | * (all != 0 implies fast == 0) |
2691 | | */ |
2692 | 0 | { |
2693 | 0 | struct page_info *pg = mfn_to_page(gmfn); |
2694 | 0 | mfn_t smfn; |
2695 | 0 | unsigned char t; |
2696 | 0 |
|
2697 | 0 | /* Dispatch table for getting per-type functions: each level must |
2698 | 0 | * be called with the function to remove a lower-level shadow. */ |
2699 | 0 | static const hash_domain_callback_t callbacks[SH_type_unused] = { |
2700 | 0 | NULL, /* none */ |
2701 | 0 | NULL, /* l1_32 */ |
2702 | 0 | NULL, /* fl1_32 */ |
2703 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */ |
2704 | 0 | NULL, /* l1_pae */ |
2705 | 0 | NULL, /* fl1_pae */ |
2706 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */ |
2707 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */ |
2708 | 0 | NULL, /* l1_64 */ |
2709 | 0 | NULL, /* fl1_64 */ |
2710 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */ |
2711 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */ |
2712 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */ |
2713 | 0 | SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */ |
2714 | 0 | NULL, /* p2m */ |
2715 | 0 | NULL /* unused */ |
2716 | 0 | }; |
2717 | 0 |
|
2718 | 0 | /* Another lookup table, for choosing which mask to use */ |
2719 | 0 | static const unsigned int masks[SH_type_unused] = { |
2720 | 0 | 0, /* none */ |
2721 | 0 | SHF_L2_32, /* l1_32 */ |
2722 | 0 | 0, /* fl1_32 */ |
2723 | 0 | 0, /* l2_32 */ |
2724 | 0 | SHF_L2H_PAE | SHF_L2_PAE, /* l1_pae */ |
2725 | 0 | 0, /* fl1_pae */ |
2726 | 0 | 0, /* l2_pae */ |
2727 | 0 | 0, /* l2h_pae */ |
2728 | 0 | SHF_L2H_64 | SHF_L2_64, /* l1_64 */ |
2729 | 0 | 0, /* fl1_64 */ |
2730 | 0 | SHF_L3_64, /* l2_64 */ |
2731 | 0 | SHF_L3_64, /* l2h_64 */ |
2732 | 0 | SHF_L4_64, /* l3_64 */ |
2733 | 0 | 0, /* l4_64 */ |
2734 | 0 | 0, /* p2m */ |
2735 | 0 | 0 /* unused */ |
2736 | 0 | }; |
2737 | 0 |
|
2738 | 0 | ASSERT(!(all && fast)); |
2739 | 0 | ASSERT(mfn_valid(gmfn)); |
2740 | 0 |
|
2741 | 0 | /* Although this is an externally visible function, we do not know |
2742 | 0 | * whether the paging lock will be held when it is called (since it |
2743 | 0 | * can be called via put_page_type when we clear a shadow l1e).*/ |
2744 | 0 | paging_lock_recursive(d); |
2745 | 0 |
|
2746 | 0 | SHADOW_PRINTK("d%d gmfn=%"PRI_mfn"\n", d->domain_id, mfn_x(gmfn)); |
2747 | 0 |
|
2748 | 0 | /* Bail out now if the page is not shadowed */ |
2749 | 0 | if ( (pg->count_info & PGC_page_table) == 0 ) |
2750 | 0 | { |
2751 | 0 | paging_unlock(d); |
2752 | 0 | return; |
2753 | 0 | } |
2754 | 0 |
|
2755 | 0 | /* Search for this shadow in all appropriate shadows */ |
2756 | 0 | perfc_incr(shadow_unshadow); |
2757 | 0 |
|
2758 | 0 | /* Lower-level shadows need to be excised from upper-level shadows. |
2759 | 0 | * This call to hash_domain_foreach() looks dangerous but is in fact OK: each
2760 | 0 | * call will remove at most one shadow, and terminate immediately when |
2761 | 0 | * it does remove it, so we never walk the hash after doing a deletion. */ |
2762 | 0 | #define DO_UNSHADOW(_type) do { \ |
2763 | 0 | t = (_type); \ |
2764 | 0 | if( !(pg->count_info & PGC_page_table) \ |
2765 | 0 | || !(pg->shadow_flags & (1 << t)) ) \ |
2766 | 0 | break; \ |
2767 | 0 | smfn = shadow_hash_lookup(d, mfn_x(gmfn), t); \ |
2768 | 0 | if ( unlikely(!mfn_valid(smfn)) ) \ |
2769 | 0 | { \ |
2770 | 0 | SHADOW_ERROR(": gmfn %#lx has flags %#"PRIx32 \ |
2771 | 0 | " but no type-%#"PRIx32" shadow\n", \ |
2772 | 0 | mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \ |
2773 | 0 | break; \ |
2774 | 0 | } \ |
2775 | 0 | if ( sh_type_is_pinnable(d, t) ) \ |
2776 | 0 | sh_unpin(d, smfn); \ |
2777 | 0 | else if ( sh_type_has_up_pointer(d, t) ) \ |
2778 | 0 | sh_remove_shadow_via_pointer(d, smfn); \ |
2779 | 0 | if( !fast \ |
2780 | 0 | && (pg->count_info & PGC_page_table) \ |
2781 | 0 | && (pg->shadow_flags & (1 << t)) ) \ |
2782 | 0 | hash_domain_foreach(d, masks[t], callbacks, smfn); \ |
2783 | 0 | } while (0) |
2784 | 0 |
|
2785 | 0 | DO_UNSHADOW(SH_type_l2_32_shadow); |
2786 | 0 | DO_UNSHADOW(SH_type_l1_32_shadow); |
2787 | 0 | DO_UNSHADOW(SH_type_l2h_pae_shadow); |
2788 | 0 | DO_UNSHADOW(SH_type_l2_pae_shadow); |
2789 | 0 | DO_UNSHADOW(SH_type_l1_pae_shadow); |
2790 | 0 | DO_UNSHADOW(SH_type_l4_64_shadow); |
2791 | 0 | DO_UNSHADOW(SH_type_l3_64_shadow); |
2792 | 0 | DO_UNSHADOW(SH_type_l2h_64_shadow); |
2793 | 0 | DO_UNSHADOW(SH_type_l2_64_shadow); |
2794 | 0 | DO_UNSHADOW(SH_type_l1_64_shadow); |
2795 | 0 |
|
2796 | 0 | #undef DO_UNSHADOW |
2797 | 0 |
|
2798 | 0 | /* If that didn't catch the shadows, something is wrong */ |
2799 | 0 | if ( !fast && all && (pg->count_info & PGC_page_table) ) |
2800 | 0 | { |
2801 | 0 | SHADOW_ERROR("can't find all shadows of mfn %"PRI_mfn" " |
2802 | 0 | "(shadow_flags=%08x)\n", |
2803 | 0 | mfn_x(gmfn), pg->shadow_flags); |
2804 | 0 | domain_crash(d); |
2805 | 0 | } |
2806 | 0 |
|
2807 | 0 | /* Need to flush TLBs now, so that linear maps are safe next time we |
2808 | 0 | * take a fault. */ |
2809 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
2810 | 0 |
|
2811 | 0 | paging_unlock(d); |
2812 | 0 | } |
2813 | | |
2814 | | static void |
2815 | | sh_remove_all_shadows_and_parents(struct domain *d, mfn_t gmfn) |
2816 | | /* Even harsher: this is an HVM page that we think is no longer a pagetable.
2817 | | * Unshadow it, and recursively unshadow pages that reference it. */ |
2818 | 0 | { |
2819 | 0 | sh_remove_shadows(d, gmfn, 0, 1); |
2820 | 0 | /* XXX TODO: |
2821 | 0 | * Rework this hashtable walker to return a linked-list of all |
2822 | 0 | * the shadows it modified, then do breadth-first recursion |
2823 | 0 | * to find the way up to higher-level tables and unshadow them too. |
2824 | 0 | * |
2825 | 0 | * The current code (just tearing down each page's shadows as we |
2826 | 0 | * detect that it is not a pagetable) is correct, but very slow. |
2827 | 0 | * It means extra emulated writes and slows down removal of mappings. */ |
2828 | 0 | } |
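
The rework described in the TODO above might take roughly this shape (a hypothetical sketch only; sh_remove_shadows_collect() and sh_remove_parents_collect() do not exist in the current code and are illustrative names):

    /* Hypothetical sketch of the TODO: have the hash walk hand back the
     * shadows it tore down, then walk upwards breadth-first, unshadowing
     * the higher-level tables that reference each of them in turn. */
    static void sh_unshadow_bfs(struct domain *d, mfn_t gmfn)
    {
        struct page_list_head work;
        struct page_info *sp;

        INIT_PAGE_LIST_HEAD(&work);

        /* Assumed variant of sh_remove_shadows() that queues whatever
         * it removes instead of discarding that information. */
        sh_remove_shadows_collect(d, gmfn, &work);

        while ( (sp = page_list_remove_head(&work)) != NULL )
            /* Unshadow the parents of this shadow too, queueing them
             * for the next round until no parents remain. */
            sh_remove_parents_collect(d, page_to_mfn(sp), &work);
    }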
2829 | | |
2830 | | /**************************************************************************/ |
2831 | | |
2832 | | /* Reset the up-pointers of every L3 shadow to 0. |
2833 | | * This is called when l3 shadows stop being pinnable, to clear out all |
2834 | | * the list-head bits so the up-pointer field is properly initialised. */
2835 | | static int sh_clear_up_pointer(struct vcpu *v, mfn_t smfn, mfn_t unused) |
2836 | 0 | { |
2837 | 0 | mfn_to_page(smfn)->up = 0; |
2838 | 0 | return 0; |
2839 | 0 | } |
2840 | | |
2841 | | void sh_reset_l3_up_pointers(struct vcpu *v) |
2842 | 0 | { |
2843 | 0 | static const hash_vcpu_callback_t callbacks[SH_type_unused] = { |
2844 | 0 | NULL, /* none */ |
2845 | 0 | NULL, /* l1_32 */ |
2846 | 0 | NULL, /* fl1_32 */ |
2847 | 0 | NULL, /* l2_32 */ |
2848 | 0 | NULL, /* l1_pae */ |
2849 | 0 | NULL, /* fl1_pae */ |
2850 | 0 | NULL, /* l2_pae */ |
2851 | 0 | NULL, /* l2h_pae */ |
2852 | 0 | NULL, /* l1_64 */ |
2853 | 0 | NULL, /* fl1_64 */ |
2854 | 0 | NULL, /* l2_64 */ |
2855 | 0 | NULL, /* l2h_64 */ |
2856 | 0 | sh_clear_up_pointer, /* l3_64 */ |
2857 | 0 | NULL, /* l4_64 */ |
2858 | 0 | NULL, /* p2m */ |
2859 | 0 | NULL /* unused */ |
2860 | 0 | }; |
2861 | 0 | static const unsigned int callback_mask = SHF_L3_64; |
2862 | 0 |
|
2863 | 0 | hash_vcpu_foreach(v, callback_mask, callbacks, INVALID_MFN); |
2864 | 0 | } |
2865 | | |
2866 | | |
2867 | | /**************************************************************************/ |
2868 | | |
2869 | | static void sh_update_paging_modes(struct vcpu *v) |
2870 | 0 | { |
2871 | 0 | struct domain *d = v->domain; |
2872 | 0 | const struct paging_mode *old_mode = v->arch.paging.mode; |
2873 | 0 |
|
2874 | 0 | ASSERT(paging_locked_by_me(d)); |
2875 | 0 |
|
2876 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) |
2877 | 0 | /* Make sure this vcpu has a virtual TLB array allocated */ |
2878 | 0 | if ( unlikely(!v->arch.paging.vtlb) ) |
2879 | 0 | { |
2880 | 0 | v->arch.paging.vtlb = xzalloc_array(struct shadow_vtlb, VTLB_ENTRIES); |
2881 | 0 | if ( unlikely(!v->arch.paging.vtlb) ) |
2882 | 0 | { |
2883 | 0 | SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n", |
2884 | 0 | d->domain_id, v->vcpu_id); |
2885 | 0 | domain_crash(v->domain); |
2886 | 0 | return; |
2887 | 0 | } |
2888 | 0 | spin_lock_init(&v->arch.paging.vtlb_lock); |
2889 | 0 | } |
2890 | 0 | #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ |
2891 | 0 |
|
2892 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
2893 | 0 | if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) |
2894 | 0 | { |
2895 | 0 | int i; |
2896 | 0 | for(i = 0; i < SHADOW_OOS_PAGES; i++) |
2897 | 0 | { |
2898 | 0 | shadow_prealloc(d, SH_type_oos_snapshot, 1); |
2899 | 0 | v->arch.paging.shadow.oos_snapshot[i] = |
2900 | 0 | shadow_alloc(d, SH_type_oos_snapshot, 0); |
2901 | 0 | } |
2902 | 0 | } |
2903 | 0 | #endif /* OOS */ |
2904 | 0 |
|
2905 | 0 | // Valid transitions handled by this function: |
2906 | 0 | // - For PV guests: |
2907 | 0 | // - after a shadow mode has been changed |
2908 | 0 | // - For HVM guests: |
2909 | 0 | // - after a shadow mode has been changed |
2910 | 0 | // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE |
2911 | 0 | // |
2912 | 0 |
|
2913 | 0 | // First, tear down any old shadow tables held by this vcpu. |
2914 | 0 | // |
2915 | 0 | if ( v->arch.paging.mode ) |
2916 | 0 | v->arch.paging.mode->shadow.detach_old_tables(v); |
2917 | 0 |
|
2918 | 0 | if ( !is_pv_domain(d) ) |
2919 | 0 | { |
2920 | 0 | /// |
2921 | 0 | /// HVM guest |
2922 | 0 | /// |
2923 | 0 | ASSERT(shadow_mode_translate(d)); |
2924 | 0 | ASSERT(shadow_mode_external(d)); |
2925 | 0 |
|
2926 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
2927 | 0 | /* Need to resync all our pages now, because if a page goes out |
2928 | 0 | * of sync with paging enabled and is resynced with paging |
2929 | 0 | * disabled, the resync will go wrong. */ |
2930 | 0 | shadow_resync_all(v); |
2931 | 0 | #endif /* OOS */ |
2932 | 0 |
|
2933 | 0 | if ( !hvm_paging_enabled(v) ) |
2934 | 0 | { |
2935 | 0 | /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE |
2936 | 0 | * pagetable for it, mapping 4 GB one-to-one using a single l2 |
2937 | 0 | * page of 1024 superpage mappings */ |
2938 | 0 | v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable; |
2939 | 0 | v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2); |
2940 | 0 | } |
2941 | 0 | else if ( hvm_long_mode_active(v) ) |
2942 | 0 | { |
2943 | 0 | // long mode guest... |
2944 | 0 | v->arch.paging.mode = |
2945 | 0 | &SHADOW_INTERNAL_NAME(sh_paging_mode, 4); |
2946 | 0 | } |
2947 | 0 | else if ( hvm_pae_enabled(v) ) |
2948 | 0 | { |
2949 | 0 | // 32-bit PAE mode guest... |
2950 | 0 | v->arch.paging.mode = |
2951 | 0 | &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); |
2952 | 0 | } |
2953 | 0 | else |
2954 | 0 | { |
2955 | 0 | // 32-bit 2 level guest... |
2956 | 0 | v->arch.paging.mode = |
2957 | 0 | &SHADOW_INTERNAL_NAME(sh_paging_mode, 2); |
2958 | 0 | } |
2959 | 0 |
|
2960 | 0 | if ( pagetable_is_null(v->arch.monitor_table) ) |
2961 | 0 | { |
2962 | 0 | mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v); |
2963 | 0 | v->arch.monitor_table = pagetable_from_mfn(mmfn); |
2964 | 0 | make_cr3(v, mmfn); |
2965 | 0 | hvm_update_host_cr3(v); |
2966 | 0 | } |
2967 | 0 |
|
2968 | 0 | if ( v->arch.paging.mode != old_mode ) |
2969 | 0 | { |
2970 | 0 | SHADOW_PRINTK("new paging mode: %pv pe=%d gl=%u " |
2971 | 0 | "sl=%u (was g=%u s=%u)\n", |
2972 | 0 | v, |
2973 | 0 | is_hvm_domain(d) ? hvm_paging_enabled(v) : 1, |
2974 | 0 | v->arch.paging.mode->guest_levels, |
2975 | 0 | v->arch.paging.mode->shadow.shadow_levels, |
2976 | 0 | old_mode ? old_mode->guest_levels : 0, |
2977 | 0 | old_mode ? old_mode->shadow.shadow_levels : 0); |
2978 | 0 | if ( old_mode && |
2979 | 0 | (v->arch.paging.mode->shadow.shadow_levels != |
2980 | 0 | old_mode->shadow.shadow_levels) ) |
2981 | 0 | { |
2982 | 0 | /* Need to make a new monitor table for the new mode */ |
2983 | 0 | mfn_t new_mfn, old_mfn; |
2984 | 0 |
|
2985 | 0 | if ( v != current && vcpu_runnable(v) ) |
2986 | 0 | { |
2987 | 0 | SHADOW_ERROR("Some third party (%pv) is changing " |
2988 | 0 | "this HVM vcpu's (%pv) paging mode " |
2989 | 0 | "while it is running.\n", |
2990 | 0 | current, v); |
2991 | 0 | /* It's not safe to do that because we can't change |
2992 | 0 | * the host CR3 for a running domain */ |
2993 | 0 | domain_crash(v->domain); |
2994 | 0 | return; |
2995 | 0 | } |
2996 | 0 |
|
2997 | 0 | old_mfn = pagetable_get_mfn(v->arch.monitor_table); |
2998 | 0 | v->arch.monitor_table = pagetable_null(); |
2999 | 0 | new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v); |
3000 | 0 | v->arch.monitor_table = pagetable_from_mfn(new_mfn); |
3001 | 0 | SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", |
3002 | 0 | mfn_x(new_mfn)); |
3003 | 0 |
|
3004 | 0 | /* Don't be running on the old monitor table when we |
3005 | 0 | * pull it down! Switch CR3, and warn the HVM code that |
3006 | 0 | * its host cr3 has changed. */ |
3007 | 0 | make_cr3(v, new_mfn); |
3008 | 0 | if ( v == current ) |
3009 | 0 | write_ptbase(v); |
3010 | 0 | hvm_update_host_cr3(v); |
3011 | 0 | old_mode->shadow.destroy_monitor_table(v, old_mfn); |
3012 | 0 | } |
3013 | 0 | } |
3014 | 0 |
|
3015 | 0 | // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. |
3016 | 0 | // These are HARD: think about the case where two CPUs have
3017 | 0 | // different values for CR4.PSE and CR4.PGE at the same time. |
3018 | 0 | // This *does* happen, at least for CR4.PGE... |
3019 | 0 | } |
3020 | 0 |
|
3021 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
3022 | 0 | /* We need to check that all the vcpus have paging enabled to |
3023 | 0 | * unsync PTs. */ |
3024 | 0 | if ( is_hvm_domain(d) && !d->arch.paging.shadow.oos_off ) |
3025 | 0 | { |
3026 | 0 | int pe = 1; |
3027 | 0 | struct vcpu *vptr; |
3028 | 0 |
|
3029 | 0 | for_each_vcpu(d, vptr) |
3030 | 0 | { |
3031 | 0 | if ( !hvm_paging_enabled(vptr) ) |
3032 | 0 | { |
3033 | 0 | pe = 0; |
3034 | 0 | break; |
3035 | 0 | } |
3036 | 0 | } |
3037 | 0 |
|
3038 | 0 | d->arch.paging.shadow.oos_active = pe; |
3039 | 0 | } |
3040 | 0 | #endif /* OOS */ |
3041 | 0 |
|
3042 | 0 | v->arch.paging.mode->update_cr3(v, 0); |
3043 | 0 | } |
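
Condensed, the guest-level selection above implements this mapping (a standalone restatement for clarity, not code from this file):

    /* Paging off -> 2 levels (the guest runs on the 32-bit non-PAE
     * identity map), long mode -> 4, PAE -> 3, else plain 2-level. */
    static unsigned int guest_paging_levels(bool paging, bool lma, bool pae)
    {
        if ( !paging )
            return 2;
        return lma ? 4 : (pae ? 3 : 2);
    }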
3044 | | |
3045 | | void shadow_update_paging_modes(struct vcpu *v) |
3046 | 0 | { |
3047 | 0 | paging_lock(v->domain); |
3048 | 0 | sh_update_paging_modes(v); |
3049 | 0 | paging_unlock(v->domain); |
3050 | 0 | } |
3051 | | |
3052 | | /**************************************************************************/ |
3053 | | /* Turning on and off shadow features */ |
3054 | | |
3055 | | static void sh_new_mode(struct domain *d, u32 new_mode) |
3056 | | /* Inform all the vcpus that the shadow mode has been changed */ |
3057 | 0 | { |
3058 | 0 | struct vcpu *v; |
3059 | 0 |
|
3060 | 0 | ASSERT(paging_locked_by_me(d)); |
3061 | 0 | ASSERT(d != current->domain); |
3062 | 0 |
|
3063 | 0 | d->arch.paging.mode = new_mode; |
3064 | 0 | for_each_vcpu(d, v) |
3065 | 0 | sh_update_paging_modes(v); |
3066 | 0 | } |
3067 | | |
3068 | | int shadow_enable(struct domain *d, u32 mode) |
3069 | | /* Turn on "permanent" shadow features: external, translate, refcount. |
3070 | | * Can only be called once on a domain, and these features cannot be |
3071 | | * disabled. |
3072 | | * Returns 0 for success, -errno for failure. */ |
3073 | 0 | { |
3074 | 0 | unsigned int old_pages; |
3075 | 0 | struct page_info *pg = NULL; |
3076 | 0 | uint32_t *e; |
3077 | 0 | int rv = 0; |
3078 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
3079 | 0 |
|
3080 | 0 | mode |= PG_SH_enable; |
3081 | 0 |
|
3082 | 0 | domain_pause(d); |
3083 | 0 |
|
3084 | 0 | /* Sanity check the arguments */ |
3085 | 0 | if ( shadow_mode_enabled(d) ) |
3086 | 0 | { |
3087 | 0 | rv = -EINVAL; |
3088 | 0 | goto out_unlocked; |
3089 | 0 | } |
3090 | 0 |
|
3091 | 0 | /* Init the shadow memory allocation if the user hasn't done so */ |
3092 | 0 | old_pages = d->arch.paging.shadow.total_pages; |
3093 | 0 | if ( old_pages == 0 ) |
3094 | 0 | { |
3095 | 0 | paging_lock(d); |
3096 | 0 | rv = shadow_set_allocation(d, 1024, NULL); /* Use at least 4MB */ |
3097 | 0 | if ( rv != 0 ) |
3098 | 0 | { |
3099 | 0 | shadow_set_allocation(d, 0, NULL); |
3100 | 0 | goto out_locked; |
3101 | 0 | } |
3102 | 0 | paging_unlock(d); |
3103 | 0 | } |
3104 | 0 |
|
3105 | 0 | /* Allow p2m and log-dirty code to borrow shadow memory */ |
3106 | 0 | d->arch.paging.alloc_page = shadow_alloc_p2m_page; |
3107 | 0 | d->arch.paging.free_page = shadow_free_p2m_page; |
3108 | 0 |
|
3109 | 0 | /* Init the P2M table. Must be done before we take the paging lock |
3110 | 0 | * to avoid possible deadlock. */ |
3111 | 0 | if ( mode & PG_translate ) |
3112 | 0 | { |
3113 | 0 | rv = p2m_alloc_table(p2m); |
3114 | 0 | if (rv != 0) |
3115 | 0 | goto out_unlocked; |
3116 | 0 | } |
3117 | 0 |
|
3118 | 0 | /* HVM domains need an extra pagetable for vcpus that think they |
3119 | 0 | * have paging disabled */ |
3120 | 0 | if ( is_hvm_domain(d) ) |
3121 | 0 | { |
3122 | 0 | /* Get a single page from the shadow pool. Take it via the |
3123 | 0 | * P2M interface to make freeing it simpler afterwards. */ |
3124 | 0 | pg = shadow_alloc_p2m_page(d); |
3125 | 0 | if ( pg == NULL ) |
3126 | 0 | { |
3127 | 0 | rv = -ENOMEM; |
3128 | 0 | goto out_unlocked; |
3129 | 0 | } |
3130 | 0 | /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB |
3131 | 0 | * of virtual address space onto the same physical address range */ |
3132 | 0 | e = __map_domain_page(pg); |
3133 | 0 | write_32bit_pse_identmap(e); |
3134 | 0 | unmap_domain_page(e); |
3135 | 0 | pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated; |
3136 | 0 | } |
3137 | 0 |
|
3138 | 0 | paging_lock(d); |
3139 | 0 |
|
3140 | 0 | /* Sanity check again with the lock held */ |
3141 | 0 | if ( shadow_mode_enabled(d) ) |
3142 | 0 | { |
3143 | 0 | rv = -EINVAL; |
3144 | 0 | goto out_locked; |
3145 | 0 | } |
3146 | 0 |
|
3147 | 0 | /* Init the hash table */ |
3148 | 0 | if ( shadow_hash_alloc(d) != 0 ) |
3149 | 0 | { |
3150 | 0 | rv = -ENOMEM; |
3151 | 0 | goto out_locked; |
3152 | 0 | } |
3153 | 0 |
|
3154 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) |
3155 | 0 | /* We assume we're dealing with an older 64bit linux guest until we |
3156 | 0 | * see the guest use more than one l4 per vcpu. */ |
3157 | 0 | d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL; |
3158 | 0 | #endif |
3159 | 0 |
|
3160 | 0 | /* Record the 1-to-1 pagetable we just made */ |
3161 | 0 | if ( is_hvm_domain(d) ) |
3162 | 0 | d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg); |
3163 | 0 |
|
3164 | 0 | /* Update the bits */ |
3165 | 0 | sh_new_mode(d, mode); |
3166 | 0 |
|
3167 | 0 | out_locked: |
3168 | 0 | paging_unlock(d); |
3169 | 0 | out_unlocked: |
3170 | 0 | if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) |
3171 | 0 | p2m_teardown(p2m); |
3172 | 0 | if ( rv != 0 && pg != NULL ) |
3173 | 0 | shadow_free_p2m_page(d, pg); |
3174 | 0 | domain_unpause(d); |
3175 | 0 | return rv; |
3176 | 0 | } |
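
For reference, the write_32bit_pse_identmap() call above (the function itself lives outside this file, in arch/x86/mm.c) fills the page along these lines: entry i maps the 4MB frame at physical address i << 22 back onto itself, giving the 1024 superpage mappings that cover 4GB one-to-one:

    /* Sketch of the identity-map fill, assuming the arch/x86/mm.c
     * definition: 1024 present, writable, user, PSE superpage entries. */
    void write_32bit_pse_identmap(uint32_t *l2)
    {
        unsigned int i;

        for ( i = 0; i < PAGE_SIZE / sizeof(*l2); i++ )
            l2[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                     _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
    }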
3177 | | |
3178 | | void shadow_teardown(struct domain *d, bool *preempted) |
3179 | | /* Destroy the shadow pagetables of this domain and free its shadow memory. |
3180 | | * Should only be called for dying domains. */ |
3181 | 0 | { |
3182 | 0 | struct vcpu *v; |
3183 | 0 | mfn_t mfn; |
3184 | 0 | struct page_info *unpaged_pagetable = NULL; |
3185 | 0 |
|
3186 | 0 | ASSERT(d->is_dying); |
3187 | 0 | ASSERT(d != current->domain); |
3188 | 0 |
|
3189 | 0 | paging_lock(d); |
3190 | 0 |
|
3191 | 0 | if ( shadow_mode_enabled(d) ) |
3192 | 0 | { |
3193 | 0 | /* Release the shadow and monitor tables held by each vcpu */ |
3194 | 0 | for_each_vcpu(d, v) |
3195 | 0 | { |
3196 | 0 | if ( v->arch.paging.mode ) |
3197 | 0 | { |
3198 | 0 | v->arch.paging.mode->shadow.detach_old_tables(v); |
3199 | 0 | if ( shadow_mode_external(d) ) |
3200 | 0 | { |
3201 | 0 | mfn = pagetable_get_mfn(v->arch.monitor_table); |
3202 | 0 | if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) ) |
3203 | 0 | v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn); |
3204 | 0 | v->arch.monitor_table = pagetable_null(); |
3205 | 0 | } |
3206 | 0 | } |
3207 | 0 | } |
3208 | 0 | } |
3209 | 0 |
|
3210 | 0 | #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) |
3211 | 0 | /* Free the virtual-TLB array attached to each vcpu */ |
3212 | 0 | for_each_vcpu(d, v) |
3213 | 0 | { |
3214 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) |
3215 | 0 | if ( v->arch.paging.vtlb ) |
3216 | 0 | { |
3217 | 0 | xfree(v->arch.paging.vtlb); |
3218 | 0 | v->arch.paging.vtlb = NULL; |
3219 | 0 | } |
3220 | 0 | #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ |
3221 | 0 |
|
3222 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
3223 | 0 | { |
3224 | 0 | int i; |
3225 | 0 | mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
3226 | 0 | for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) |
3227 | 0 | if ( mfn_valid(oos_snapshot[i]) ) |
3228 | 0 | { |
3229 | 0 | shadow_free(d, oos_snapshot[i]); |
3230 | 0 | oos_snapshot[i] = INVALID_MFN; |
3231 | 0 | } |
3232 | 0 | } |
3233 | 0 | #endif /* OOS */ |
3234 | 0 | } |
3235 | 0 | #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */ |
3236 | 0 |
|
3237 | 0 | if ( d->arch.paging.shadow.total_pages != 0 ) |
3238 | 0 | { |
3239 | 0 | /* Destroy all the shadows and release memory to domheap */ |
3240 | 0 | shadow_set_allocation(d, 0, preempted); |
3241 | 0 |
|
3242 | 0 | if ( preempted && *preempted ) |
3243 | 0 | goto out; |
3244 | 0 |
|
3245 | 0 | /* Release the hash table back to xenheap */ |
3246 | 0 | if (d->arch.paging.shadow.hash_table) |
3247 | 0 | shadow_hash_teardown(d); |
3248 | 0 |
|
3249 | 0 | ASSERT(d->arch.paging.shadow.total_pages == 0); |
3250 | 0 | } |
3251 | 0 |
|
3252 | 0 | /* Free the non-paged-vcpus pagetable; must happen after we've |
3253 | 0 | * destroyed any shadows of it or sh_destroy_shadow will get confused. */ |
3254 | 0 | if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) ) |
3255 | 0 | { |
3256 | 0 | ASSERT(is_hvm_domain(d)); |
3257 | 0 | for_each_vcpu(d, v) |
3258 | 0 | if ( !hvm_paging_enabled(v) ) |
3259 | 0 | v->arch.guest_table = pagetable_null(); |
3260 | 0 | unpaged_pagetable = |
3261 | 0 | pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable); |
3262 | 0 | d->arch.paging.shadow.unpaged_pagetable = pagetable_null(); |
3263 | 0 | } |
3264 | 0 |
|
3265 | 0 | /* We leave the "permanent" shadow modes enabled, but clear the |
3266 | 0 | * log-dirty mode bit. We don't want any more mark_dirty() |
3267 | 0 | * calls now that we've torn down the bitmap */ |
3268 | 0 | d->arch.paging.mode &= ~PG_log_dirty; |
3269 | 0 |
|
3270 | 0 | if (d->arch.hvm_domain.dirty_vram) { |
3271 | 0 | xfree(d->arch.hvm_domain.dirty_vram->sl1ma); |
3272 | 0 | xfree(d->arch.hvm_domain.dirty_vram->dirty_bitmap); |
3273 | 0 | xfree(d->arch.hvm_domain.dirty_vram); |
3274 | 0 | d->arch.hvm_domain.dirty_vram = NULL; |
3275 | 0 | } |
3276 | 0 |
|
3277 | 0 | out: |
3278 | 0 | paging_unlock(d); |
3279 | 0 |
|
3280 | 0 | /* Must be called outside the lock */ |
3281 | 0 | if ( unpaged_pagetable ) |
3282 | 0 | shadow_free_p2m_page(d, unpaged_pagetable); |
3283 | 0 | } |
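
The preempted out-parameter follows the usual continuation pattern: the caller (paging_teardown() in this tree) is expected to check it and arrange for the operation to be restarted. Schematically (a sketch of the caller's side, not code from this file):

    /* Schematic caller: report -ERESTART on preemption so the
     * relinquish path re-invokes us and we continue where we stopped. */
    static int teardown_with_continuation(struct domain *d)
    {
        bool preempted = false;

        shadow_teardown(d, &preempted);
        return preempted ? -ERESTART : 0;
    }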
3284 | | |
3285 | | void shadow_final_teardown(struct domain *d) |
3286 | | /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ |
3287 | 0 | { |
3288 | 0 | SHADOW_PRINTK("dom %u final teardown starts." |
3289 | 0 | " Shadow pages total = %u, free = %u, p2m=%u\n", |
3290 | 0 | d->domain_id, |
3291 | 0 | d->arch.paging.shadow.total_pages, |
3292 | 0 | d->arch.paging.shadow.free_pages, |
3293 | 0 | d->arch.paging.shadow.p2m_pages); |
3294 | 0 |
|
3295 | 0 | /* Double-check that the domain didn't have any shadow memory. |
3296 | 0 | * It is possible for a domain that never got domain_kill()ed |
3297 | 0 | * to get here with its shadow allocation intact. */ |
3298 | 0 | if ( d->arch.paging.shadow.total_pages != 0 ) |
3299 | 0 | shadow_teardown(d, NULL); |
3300 | 0 |
|
3301 | 0 | /* It is now safe to pull down the p2m map. */ |
3302 | 0 | p2m_teardown(p2m_get_hostp2m(d)); |
3303 | 0 | /* Free any shadow memory that the p2m teardown released */ |
3304 | 0 | paging_lock(d); |
3305 | 0 | shadow_set_allocation(d, 0, NULL); |
3306 | 0 | SHADOW_PRINTK("dom %u final teardown done." |
3307 | 0 | " Shadow pages total = %u, free = %u, p2m=%u\n", |
3308 | 0 | d->domain_id, |
3309 | 0 | d->arch.paging.shadow.total_pages, |
3310 | 0 | d->arch.paging.shadow.free_pages, |
3311 | 0 | d->arch.paging.shadow.p2m_pages); |
3312 | 0 | paging_unlock(d); |
3313 | 0 | } |
3314 | | |
3315 | | static int shadow_one_bit_enable(struct domain *d, u32 mode) |
3316 | | /* Turn on a single shadow mode feature */ |
3317 | 0 | { |
3318 | 0 | ASSERT(paging_locked_by_me(d)); |
3319 | 0 |
|
3320 | 0 | /* Sanity check the call */ |
3321 | 0 | if ( d == current->domain || (d->arch.paging.mode & mode) == mode ) |
3322 | 0 | { |
3323 | 0 | return -EINVAL; |
3324 | 0 | } |
3325 | 0 |
|
3326 | 0 | mode |= PG_SH_enable; |
3327 | 0 |
|
3328 | 0 | if ( d->arch.paging.shadow.total_pages == 0 ) |
3329 | 0 | { |
3330 | 0 | /* Init the shadow memory allocation if the user hasn't done so */ |
3331 | 0 | if ( shadow_set_allocation(d, 1, NULL) != 0 ) |
3332 | 0 | { |
3333 | 0 | shadow_set_allocation(d, 0, NULL); |
3334 | 0 | return -ENOMEM; |
3335 | 0 | } |
3336 | 0 | } |
3337 | 0 |
|
3338 | 0 | /* Allow p2m and log-dirty code to borrow shadow memory */ |
3339 | 0 | d->arch.paging.alloc_page = shadow_alloc_p2m_page; |
3340 | 0 | d->arch.paging.free_page = shadow_free_p2m_page; |
3341 | 0 |
|
3342 | 0 | if ( d->arch.paging.mode == 0 ) |
3343 | 0 | { |
3344 | 0 | /* Init the shadow hash table */ |
3345 | 0 | if ( shadow_hash_alloc(d) != 0 ) |
3346 | 0 | return -ENOMEM; |
3347 | 0 | } |
3348 | 0 |
|
3349 | 0 | /* Update the bits */ |
3350 | 0 | sh_new_mode(d, d->arch.paging.mode | mode); |
3351 | 0 |
|
3352 | 0 | return 0; |
3353 | 0 | } |
3354 | | |
3355 | | static int shadow_one_bit_disable(struct domain *d, u32 mode) |
3356 | | /* Turn off a single shadow mode feature */ |
3357 | 0 | { |
3358 | 0 | struct vcpu *v; |
3359 | 0 | ASSERT(paging_locked_by_me(d)); |
3360 | 0 |
|
3361 | 0 | /* Sanity check the call */ |
3362 | 0 | if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) ) |
3363 | 0 | { |
3364 | 0 | return -EINVAL; |
3365 | 0 | } |
3366 | 0 |
|
3367 | 0 | /* Update the bits */ |
3368 | 0 | sh_new_mode(d, d->arch.paging.mode & ~mode); |
3369 | 0 | if ( d->arch.paging.mode == 0 ) |
3370 | 0 | { |
3371 | 0 | /* Get this domain off shadows */ |
3372 | 0 | SHADOW_PRINTK("un-shadowing of domain %u starts." |
3373 | 0 | " Shadow pages total = %u, free = %u, p2m=%u\n", |
3374 | 0 | d->domain_id, |
3375 | 0 | d->arch.paging.shadow.total_pages, |
3376 | 0 | d->arch.paging.shadow.free_pages, |
3377 | 0 | d->arch.paging.shadow.p2m_pages); |
3378 | 0 | for_each_vcpu(d, v) |
3379 | 0 | { |
3380 | 0 | if ( v->arch.paging.mode ) |
3381 | 0 | v->arch.paging.mode->shadow.detach_old_tables(v); |
3382 | 0 | if ( !(v->arch.flags & TF_kernel_mode) ) |
3383 | 0 | make_cr3(v, pagetable_get_mfn(v->arch.guest_table_user)); |
3384 | 0 | else |
3385 | 0 | make_cr3(v, pagetable_get_mfn(v->arch.guest_table)); |
3386 | 0 |
|
3387 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
3388 | 0 | { |
3389 | 0 | int i; |
3390 | 0 | mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; |
3391 | 0 | for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) |
3392 | 0 | if ( mfn_valid(oos_snapshot[i]) ) |
3393 | 0 | { |
3394 | 0 | shadow_free(d, oos_snapshot[i]); |
3395 | 0 | oos_snapshot[i] = INVALID_MFN; |
3396 | 0 | } |
3397 | 0 | } |
3398 | 0 | #endif /* OOS */ |
3399 | 0 | } |
3400 | 0 |
|
3401 | 0 | /* Pull down the memory allocation */ |
3402 | 0 | if ( shadow_set_allocation(d, 0, NULL) != 0 ) |
3403 | 0 | BUG(); /* In fact, we will have BUG()ed already */ |
3404 | 0 | shadow_hash_teardown(d); |
3405 | 0 | SHADOW_PRINTK("un-shadowing of domain %u done." |
3406 | 0 | " Shadow pages total = %u, free = %u, p2m=%u\n", |
3407 | 0 | d->domain_id, |
3408 | 0 | d->arch.paging.shadow.total_pages, |
3409 | 0 | d->arch.paging.shadow.free_pages, |
3410 | 0 | d->arch.paging.shadow.p2m_pages); |
3411 | 0 | } |
3412 | 0 |
|
3413 | 0 | return 0; |
3414 | 0 | } |
3415 | | |
3416 | | /* Enable/disable ops for the "test" and "log-dirty" modes */ |
3417 | | static int shadow_test_enable(struct domain *d) |
3418 | 0 | { |
3419 | 0 | int ret; |
3420 | 0 |
|
3421 | 0 | domain_pause(d); |
3422 | 0 | paging_lock(d); |
3423 | 0 | ret = shadow_one_bit_enable(d, PG_SH_enable); |
3424 | 0 | paging_unlock(d); |
3425 | 0 | domain_unpause(d); |
3426 | 0 |
|
3427 | 0 | return ret; |
3428 | 0 | } |
3429 | | |
3430 | | static int shadow_test_disable(struct domain *d) |
3431 | 0 | { |
3432 | 0 | int ret; |
3433 | 0 |
|
3434 | 0 | domain_pause(d); |
3435 | 0 | paging_lock(d); |
3436 | 0 | ret = shadow_one_bit_disable(d, PG_SH_enable); |
3437 | 0 | paging_unlock(d); |
3438 | 0 | domain_unpause(d); |
3439 | 0 |
|
3440 | 0 | return ret; |
3441 | 0 | } |
3442 | | |
3443 | | /**************************************************************************/ |
3444 | | /* P2M map manipulations */ |
3445 | | |
3446 | | /* Shadow-specific code which should be called when a P2M table entry is
3447 | | * updated with new content. It is responsible for updating the entry, as
3448 | | * well as for other shadow processing jobs.
3449 | | */ |
3450 | | |
3451 | | static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn, |
3452 | | l1_pgentry_t *p, l1_pgentry_t new, |
3453 | | unsigned int level) |
3454 | 0 | { |
3455 | 0 | /* The following assertion is to make sure we don't step on 1GB host |
3456 | 0 | * page support of HVM guests. */
3457 | 0 | ASSERT(!(level > 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) && |
3458 | 0 | (l1e_get_flags(*p) & _PAGE_PSE))); |
3459 | 0 |
|
3460 | 0 | /* If we're removing an MFN from the p2m, remove it from the shadows too */ |
3461 | 0 | if ( level == 1 ) |
3462 | 0 | { |
3463 | 0 | mfn_t mfn = l1e_get_mfn(*p); |
3464 | 0 | p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p)); |
3465 | 0 | if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) ) |
3466 | 0 | { |
3467 | 0 | sh_remove_all_shadows_and_parents(d, mfn); |
3468 | 0 | if ( sh_remove_all_mappings(d, mfn, _gfn(gfn)) ) |
3469 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
3470 | 0 | } |
3471 | 0 | } |
3472 | 0 |
|
3473 | 0 | /* If we're removing a superpage mapping from the p2m, we need to check |
3474 | 0 | * all the pages covered by it. If they're still there in the new |
3475 | 0 | * scheme, that's OK, but otherwise they must be unshadowed. */ |
3476 | 0 | if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) && |
3477 | 0 | (l1e_get_flags(*p) & _PAGE_PSE) ) |
3478 | 0 | { |
3479 | 0 | unsigned int i; |
3480 | 0 | cpumask_t flushmask; |
3481 | 0 | mfn_t omfn = l1e_get_mfn(*p); |
3482 | 0 | mfn_t nmfn = l1e_get_mfn(new); |
3483 | 0 | l1_pgentry_t *npte = NULL; |
3484 | 0 | p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p)); |
3485 | 0 | if ( p2m_is_valid(p2mt) && mfn_valid(omfn) ) |
3486 | 0 | { |
3487 | 0 | cpumask_clear(&flushmask); |
3488 | 0 |
|
3489 | 0 | /* If we're replacing a superpage with a normal L1 page, map it */ |
3490 | 0 | if ( (l1e_get_flags(new) & _PAGE_PRESENT) |
3491 | 0 | && !(l1e_get_flags(new) & _PAGE_PSE) |
3492 | 0 | && mfn_valid(nmfn) ) |
3493 | 0 | npte = map_domain_page(nmfn); |
3494 | 0 |
|
3495 | 0 | for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) |
3496 | 0 | { |
3497 | 0 | if ( !npte |
3498 | 0 | || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i]))) |
3499 | 0 | || l1e_get_pfn(npte[i]) != mfn_x(omfn) ) |
3500 | 0 | { |
3501 | 0 | /* This GFN->MFN mapping has gone away */ |
3502 | 0 | sh_remove_all_shadows_and_parents(d, omfn); |
3503 | 0 | if ( sh_remove_all_mappings(d, omfn, |
3504 | 0 | _gfn(gfn + i)) )
3505 | 0 | cpumask_or(&flushmask, &flushmask, |
3506 | 0 | d->domain_dirty_cpumask); |
3507 | 0 | } |
3508 | 0 | omfn = _mfn(mfn_x(omfn) + 1); |
3509 | 0 | } |
3510 | 0 | flush_tlb_mask(&flushmask); |
3511 | 0 |
|
3512 | 0 | if ( npte ) |
3513 | 0 | unmap_domain_page(npte); |
3514 | 0 | } |
3515 | 0 | } |
3516 | 0 | } |
3517 | | |
3518 | | void |
3519 | | shadow_write_p2m_entry(struct domain *d, unsigned long gfn, |
3520 | | l1_pgentry_t *p, l1_pgentry_t new, |
3521 | | unsigned int level) |
3522 | 0 | { |
3523 | 0 | paging_lock(d); |
3524 | 0 |
|
3525 | 0 | /* If there are any shadows, update them. But if shadow_teardown() |
3526 | 0 | * has already been called then it's not safe to try. */ |
3527 | 0 | if ( likely(d->arch.paging.shadow.total_pages != 0) ) |
3528 | 0 | sh_unshadow_for_p2m_change(d, gfn, p, new, level); |
3529 | 0 |
|
3530 | 0 | /* Update the entry with new content */ |
3531 | 0 | safe_write_pte(p, new); |
3532 | 0 |
|
3533 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) |
3534 | 0 | /* If we're doing FAST_FAULT_PATH, then shadow mode may have |
3535 | 0 | cached the fact that this is an mmio region in the shadow |
3536 | 0 | page tables. Blow the tables away to remove the cache. |
3537 | 0 | This is pretty heavy handed, but this is a rare operation |
3538 | 0 | (it might happen a dozen times during boot and then never |
3539 | 0 | again), so it doesn't matter too much. */ |
3540 | 0 | if ( d->arch.paging.shadow.has_fast_mmio_entries ) |
3541 | 0 | { |
3542 | 0 | shadow_blow_tables(d); |
3543 | 0 | d->arch.paging.shadow.has_fast_mmio_entries = 0; |
3544 | 0 | } |
3545 | 0 | #endif |
3546 | 0 |
|
3547 | 0 | paging_unlock(d); |
3548 | 0 | } |
3549 | | |
3550 | | /**************************************************************************/ |
3551 | | /* Log-dirty mode support */ |
3552 | | |
3553 | | /* Shadow-specific code called from paging_log_dirty_enable().
3554 | | * Returns 0 if no problem is found.
3555 | | */ |
3556 | | static int sh_enable_log_dirty(struct domain *d, bool log_global) |
3557 | 0 | { |
3558 | 0 | int ret; |
3559 | 0 |
|
3560 | 0 | paging_lock(d); |
3561 | 0 | if ( shadow_mode_enabled(d) ) |
3562 | 0 | { |
3563 | 0 | /* This domain already has some shadows: need to clear them out |
3564 | 0 | * of the way to make sure that all references to guest memory are |
3565 | 0 | * properly write-protected */ |
3566 | 0 | shadow_blow_tables(d); |
3567 | 0 | } |
3568 | 0 |
|
3569 | 0 | #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) |
3570 | 0 | /* 32bit PV guests on 64bit xen behave like older 64bit linux: they |
3571 | 0 | * change an l4e instead of cr3 to switch tables. Give them the |
3572 | 0 | * same optimization */ |
3573 | 0 | if ( is_pv_32bit_domain(d) ) |
3574 | 0 | d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL; |
3575 | 0 | #endif |
3576 | 0 |
|
3577 | 0 | ret = shadow_one_bit_enable(d, PG_log_dirty); |
3578 | 0 | paging_unlock(d); |
3579 | 0 |
|
3580 | 0 | return ret; |
3581 | 0 | } |
3582 | | |
3583 | | /* Shadow-specific code called from paging_log_dirty_disable(). */
3584 | | static int sh_disable_log_dirty(struct domain *d) |
3585 | 0 | { |
3586 | 0 | int ret; |
3587 | 0 |
|
3588 | 0 | paging_lock(d); |
3589 | 0 | ret = shadow_one_bit_disable(d, PG_log_dirty); |
3590 | 0 | paging_unlock(d); |
3591 | 0 |
|
3592 | 0 | return ret; |
3593 | 0 | } |
3594 | | |
3595 | | /* This function is called when we CLEAN the log-dirty bitmap. See
3596 | | * paging_log_dirty_op() for details.
3597 | | */ |
3598 | | static void sh_clean_dirty_bitmap(struct domain *d) |
3599 | 0 | { |
3600 | 0 | paging_lock(d); |
3601 | 0 | /* Need to revoke write access to the domain's pages again. |
3602 | 0 | * In future, we'll have a less heavy-handed approach to this, |
3603 | 0 | * but for now, we just unshadow everything except Xen. */ |
3604 | 0 | shadow_blow_tables(d); |
3605 | 0 | paging_unlock(d); |
3606 | 0 | } |
3607 | | |
3608 | | |
3609 | | /**************************************************************************/ |
3610 | | /* VRAM dirty tracking support */ |
3611 | | int shadow_track_dirty_vram(struct domain *d, |
3612 | | unsigned long begin_pfn, |
3613 | | unsigned long nr, |
3614 | | XEN_GUEST_HANDLE_PARAM(void) guest_dirty_bitmap) |
3615 | 0 | { |
3616 | 0 | int rc = 0; |
3617 | 0 | unsigned long end_pfn = begin_pfn + nr; |
3618 | 0 | unsigned long dirty_size = (nr + 7) / 8; |
3619 | 0 | int flush_tlb = 0; |
3620 | 0 | unsigned long i; |
3621 | 0 | p2m_type_t t; |
3622 | 0 | struct sh_dirty_vram *dirty_vram; |
3623 | 0 | struct p2m_domain *p2m = p2m_get_hostp2m(d); |
3624 | 0 | uint8_t *dirty_bitmap = NULL; |
3625 | 0 |
|
3626 | 0 | if ( end_pfn < begin_pfn || end_pfn > p2m->max_mapped_pfn + 1 ) |
3627 | 0 | return -EINVAL; |
3628 | 0 |
|
3629 | 0 | /* We perform p2m lookups, so lock the p2m upfront to avoid deadlock */ |
3630 | 0 | p2m_lock(p2m_get_hostp2m(d)); |
3631 | 0 | paging_lock(d); |
3632 | 0 |
|
3633 | 0 | dirty_vram = d->arch.hvm_domain.dirty_vram; |
3634 | 0 |
|
3635 | 0 | if ( dirty_vram && (!nr || |
3636 | 0 | ( begin_pfn != dirty_vram->begin_pfn |
3637 | 0 | || end_pfn != dirty_vram->end_pfn )) ) |
3638 | 0 | { |
3639 | 0 | /* Different tracking, tear the previous down. */ |
3640 | 0 | gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", dirty_vram->begin_pfn, dirty_vram->end_pfn); |
3641 | 0 | xfree(dirty_vram->sl1ma); |
3642 | 0 | xfree(dirty_vram->dirty_bitmap); |
3643 | 0 | xfree(dirty_vram); |
3644 | 0 | dirty_vram = d->arch.hvm_domain.dirty_vram = NULL; |
3645 | 0 | } |
3646 | 0 |
|
3647 | 0 | if ( !nr ) |
3648 | 0 | goto out; |
3649 | 0 |
|
3650 | 0 | dirty_bitmap = vzalloc(dirty_size); |
3651 | 0 | if ( dirty_bitmap == NULL ) |
3652 | 0 | { |
3653 | 0 | rc = -ENOMEM; |
3654 | 0 | goto out; |
3655 | 0 | } |
3656 | 0 | /* This should happen rarely (video mode change),
3657 | 0 | * so there is no need to be efficient here. */
3658 | 0 | if ( !dirty_vram ) |
3659 | 0 | { |
3660 | 0 | /* Throw away all the shadows rather than walking through them |
3661 | 0 | * up to nr times getting rid of mappings of each pfn */ |
3662 | 0 | shadow_blow_tables(d); |
3663 | 0 |
|
3664 | 0 | gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn); |
3665 | 0 |
|
3666 | 0 | rc = -ENOMEM; |
3667 | 0 | if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL ) |
3668 | 0 | goto out; |
3669 | 0 | dirty_vram->begin_pfn = begin_pfn; |
3670 | 0 | dirty_vram->end_pfn = end_pfn; |
3671 | 0 | d->arch.hvm_domain.dirty_vram = dirty_vram; |
3672 | 0 |
|
3673 | 0 | if ( (dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL ) |
3674 | 0 | goto out_dirty_vram; |
3675 | 0 | memset(dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr); |
3676 | 0 |
|
3677 | 0 | if ( (dirty_vram->dirty_bitmap = xzalloc_array(uint8_t, dirty_size)) == NULL ) |
3678 | 0 | goto out_sl1ma; |
3679 | 0 |
|
3680 | 0 | dirty_vram->last_dirty = NOW(); |
3681 | 0 |
|
3682 | 0 | /* Tell the caller that this time we could not track dirty bits. */ |
3683 | 0 | rc = -ENODATA; |
3684 | 0 | } |
3685 | 0 | else if (dirty_vram->last_dirty == -1) |
3686 | 0 | /* still completely clean, just copy our empty bitmap */ |
3687 | 0 | memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size); |
3688 | 0 | else |
3689 | 0 | { |
3690 | 0 | unsigned long map_mfn = mfn_x(INVALID_MFN); |
3691 | 0 | void *map_sl1p = NULL; |
3692 | 0 |
|
3693 | 0 | /* Iterate over VRAM to track dirty bits. */ |
3694 | 0 | for ( i = 0; i < nr; i++ ) { |
3695 | 0 | mfn_t mfn = get_gfn_query_unlocked(d, begin_pfn + i, &t); |
3696 | 0 | struct page_info *page; |
3697 | 0 | int dirty = 0; |
3698 | 0 | paddr_t sl1ma = dirty_vram->sl1ma[i]; |
3699 | 0 |
|
3700 | 0 | if ( mfn_eq(mfn, INVALID_MFN) )
3701 | 0 | { |
3702 | 0 | dirty = 1; |
3703 | 0 | } |
3704 | 0 | else |
3705 | 0 | { |
3706 | 0 | page = mfn_to_page(mfn); |
3707 | 0 | switch (page->u.inuse.type_info & PGT_count_mask) |
3708 | 0 | { |
3709 | 0 | case 0: |
3710 | 0 | /* No guest reference, nothing to track. */ |
3711 | 0 | break; |
3712 | 0 | case 1: |
3713 | 0 | /* One guest reference. */ |
3714 | 0 | if ( sl1ma == INVALID_PADDR ) |
3715 | 0 | { |
3716 | 0 | /* We don't know which sl1e points to this, too bad. */ |
3717 | 0 | dirty = 1; |
3718 | 0 | /* TODO: Heuristics for finding the single mapping of |
3719 | 0 | * this gmfn */ |
3720 | 0 | flush_tlb |= sh_remove_all_mappings(d, mfn, |
3721 | 0 | _gfn(begin_pfn + i)); |
3722 | 0 | } |
3723 | 0 | else |
3724 | 0 | { |
3725 | 0 | /* Hopefully the most common case: only one mapping, |
3726 | 0 | * whose dirty bit we can use. */ |
3727 | 0 | l1_pgentry_t *sl1e; |
3728 | 0 | unsigned long sl1mfn = paddr_to_pfn(sl1ma); |
3729 | 0 |
|
3730 | 0 | if ( sl1mfn != map_mfn ) |
3731 | 0 | { |
3732 | 0 | if ( map_sl1p ) |
3733 | 0 | unmap_domain_page(map_sl1p); |
3734 | 0 | map_sl1p = map_domain_page(_mfn(sl1mfn)); |
3735 | 0 | map_mfn = sl1mfn; |
3736 | 0 | } |
3737 | 0 | sl1e = map_sl1p + (sl1ma & ~PAGE_MASK); |
3738 | 0 |
|
3739 | 0 | if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY ) |
3740 | 0 | { |
3741 | 0 | dirty = 1; |
3742 | 0 | /* Note: this is atomic, so we may clear a |
3743 | 0 | * _PAGE_ACCESSED set by another processor. */ |
3744 | 0 | l1e_remove_flags(*sl1e, _PAGE_DIRTY); |
3745 | 0 | flush_tlb = 1; |
3746 | 0 | } |
3747 | 0 | } |
3748 | 0 | break; |
3749 | 0 | default: |
3750 | 0 | /* More than one guest reference,
3751 | 0 | * we can't afford to track that. */
3752 | 0 | dirty = 1; |
3753 | 0 | break; |
3754 | 0 | } |
3755 | 0 | } |
3756 | 0 |
|
3757 | 0 | if ( dirty ) |
3758 | 0 | { |
3759 | 0 | dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8); |
3760 | 0 | dirty_vram->last_dirty = NOW(); |
3761 | 0 | } |
3762 | 0 | } |
3763 | 0 |
|
3764 | 0 | if ( map_sl1p ) |
3765 | 0 | unmap_domain_page(map_sl1p); |
3766 | 0 |
|
3767 | 0 | memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size); |
3768 | 0 | memset(dirty_vram->dirty_bitmap, 0, dirty_size); |
3769 | 0 | if ( dirty_vram->last_dirty + SECONDS(2) < NOW() ) |
3770 | 0 | { |
3771 | 0 | /* was clean for more than two seconds, try to disable guest |
3772 | 0 | * write access */ |
3773 | 0 | for ( i = begin_pfn; i < end_pfn; i++ ) |
3774 | 0 | { |
3775 | 0 | mfn_t mfn = get_gfn_query_unlocked(d, i, &t); |
3776 | 0 | if ( !mfn_eq(mfn, INVALID_MFN) ) |
3777 | 0 | flush_tlb |= sh_remove_write_access(d, mfn, 1, 0); |
3778 | 0 | } |
3779 | 0 | dirty_vram->last_dirty = -1; |
3780 | 0 | } |
3781 | 0 | } |
3782 | 0 | if ( flush_tlb ) |
3783 | 0 | flush_tlb_mask(d->domain_dirty_cpumask); |
3784 | 0 | goto out; |
3785 | 0 |
|
3786 | 0 | out_sl1ma: |
3787 | 0 | xfree(dirty_vram->sl1ma); |
3788 | 0 | out_dirty_vram: |
3789 | 0 | xfree(dirty_vram); |
3790 | 0 | dirty_vram = d->arch.hvm_domain.dirty_vram = NULL; |
3791 | 0 |
|
3792 | 0 | out: |
3793 | 0 | paging_unlock(d); |
3794 | 0 | if ( rc == 0 && dirty_bitmap != NULL && |
3795 | 0 | copy_to_guest(guest_dirty_bitmap, dirty_bitmap, dirty_size) ) |
3796 | 0 | { |
3797 | 0 | paging_lock(d); |
3798 | 0 | for ( i = 0; i < dirty_size; i++ ) |
3799 | 0 | dirty_vram->dirty_bitmap[i] |= dirty_bitmap[i]; |
3800 | 0 | paging_unlock(d); |
3801 | 0 | rc = -EFAULT; |
3802 | 0 | } |
3803 | 0 | vfree(dirty_bitmap); |
3804 | 0 | p2m_unlock(p2m_get_hostp2m(d)); |
3805 | 0 | return rc; |
3806 | 0 | } |
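
The bitmap handed back to the caller packs one bit per pfn in the tracked range, set above via dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8). A caller-side read of it would look like this (hypothetical helper, not part of this interface):

    /* Hypothetical helper: bit i of the returned bitmap covers
     * pfn begin_pfn + i of the tracked VRAM range. */
    static inline bool vram_pfn_is_dirty(const uint8_t *bitmap,
                                         unsigned long i)
    {
        return (bitmap[i / 8] >> (i % 8)) & 1;
    }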
3807 | | |
3808 | | /**************************************************************************/ |
3809 | | /* Shadow-control XEN_DOMCTL dispatcher */ |
3810 | | |
3811 | | int shadow_domctl(struct domain *d, |
3812 | | struct xen_domctl_shadow_op *sc, |
3813 | | XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) |
3814 | 0 | { |
3815 | 0 | int rc; |
3816 | 0 | bool preempted = false; |
3817 | 0 |
|
3818 | 0 | switch ( sc->op ) |
3819 | 0 | { |
3820 | 0 | case XEN_DOMCTL_SHADOW_OP_OFF: |
3821 | 0 | if ( d->arch.paging.mode == PG_SH_enable ) |
3822 | 0 | if ( (rc = shadow_test_disable(d)) != 0 ) |
3823 | 0 | return rc; |
3824 | 0 | return 0; |
3825 | 0 |
|
3826 | 0 | case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: |
3827 | 0 | return shadow_test_enable(d); |
3828 | 0 |
|
3829 | 0 | case XEN_DOMCTL_SHADOW_OP_ENABLE: |
3830 | 0 | return paging_enable(d, sc->mode << PG_mode_shift); |
3831 | 0 |
|
3832 | 0 | case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: |
3833 | 0 | sc->mb = shadow_get_allocation(d); |
3834 | 0 | return 0; |
3835 | 0 |
|
3836 | 0 | case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: |
3837 | 0 | paging_lock(d); |
3838 | 0 | if ( sc->mb == 0 && shadow_mode_enabled(d) ) |
3839 | 0 | { |
3840 | 0 | /* Can't set the allocation to zero unless the domain stops using |
3841 | 0 | * shadow pagetables first */ |
3842 | 0 | SHADOW_ERROR("Can't set shadow allocation to zero, domain %u" |
3843 | 0 | " is still using shadows.\n", d->domain_id); |
3844 | 0 | paging_unlock(d); |
3845 | 0 | return -EINVAL; |
3846 | 0 | } |
3847 | 0 | rc = shadow_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); |
3848 | 0 | paging_unlock(d); |
3849 | 0 | if ( preempted ) |
3850 | 0 | /* Not finished. Set up to re-run the call. */ |
3851 | 0 | rc = hypercall_create_continuation( |
3852 | 0 | __HYPERVISOR_domctl, "h", u_domctl); |
3853 | 0 | else |
3854 | 0 | /* Finished. Return the new allocation */ |
3855 | 0 | sc->mb = shadow_get_allocation(d); |
3856 | 0 | return rc; |
3857 | 0 |
|
3858 | 0 | default: |
3859 | 0 | SHADOW_ERROR("Bad shadow op %u\n", sc->op); |
3860 | 0 | return -EINVAL; |
3861 | 0 | } |
3862 | 0 | } |
3863 | | |
3864 | | |
3865 | | /**************************************************************************/ |
3866 | | /* Auditing shadow tables */ |
3867 | | |
3868 | | #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL |
3869 | | |
3870 | | void shadow_audit_tables(struct vcpu *v) |
3871 | | { |
3872 | | /* Dispatch table for getting per-type functions */ |
3873 | | static const hash_vcpu_callback_t callbacks[SH_type_unused] = { |
3874 | | NULL, /* none */ |
3875 | | SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */ |
3876 | | SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */ |
3877 | | SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */ |
3878 | | SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */ |
3879 | | SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */ |
3880 | | SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */ |
3881 | | SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */ |
3882 | | SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */ |
3883 | | SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */ |
3884 | | SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */ |
3885 | | SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */ |
3886 | | SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */ |
3887 | | SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */ |
3888 | | NULL /* All the rest */ |
3889 | | }; |
3890 | | unsigned int mask; |
3891 | | |
3892 | | if ( !(SHADOW_AUDIT_ENABLE) ) |
3893 | | return; |
3894 | | |
3895 | | #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) |
3896 | | sh_oos_audit(v->domain); |
3897 | | #endif |
3898 | | |
3899 | | if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) |
3900 | | mask = SHF_page_type_mask; /* Audit every table in the system */ |
3901 | | else |
3902 | | { |
3903 | | /* Audit only the current mode's tables */ |
3904 | | switch ( v->arch.paging.mode->guest_levels ) |
3905 | | { |
3906 | | case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break; |
3907 | | case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE |
3908 | | |SHF_L2H_PAE); break; |
3909 | | case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64 |
3910 | | |SHF_L3_64|SHF_L4_64); break; |
3911 | | default: BUG(); |
3912 | | } |
3913 | | } |
3914 | | |
3915 | | hash_vcpu_foreach(v, mask, callbacks, INVALID_MFN); |
3916 | | } |
3917 | | |
3918 | | #endif /* Shadow audit */ |
3919 | | |
3920 | | /* |
3921 | | * Local variables: |
3922 | | * mode: C |
3923 | | * c-file-style: "BSD" |
3924 | | * c-basic-offset: 4 |
3925 | | * indent-tabs-mode: nil |
3926 | | * End: |
3927 | | */ |