/root/src/xen/xen/arch/x86/domain.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * arch/x86/domain.c |
3 | | * |
4 | | * x86-specific domain handling (e.g., register setup and context switching). |
5 | | */ |
6 | | |
7 | | /* |
8 | | * Copyright (C) 1995 Linus Torvalds |
9 | | * |
10 | | * Pentium III FXSR, SSE support |
11 | | * Gareth Hughes <gareth@valinux.com>, May 2000 |
12 | | */ |
13 | | |
14 | | #include <xen/init.h> |
15 | | #include <xen/lib.h> |
16 | | #include <xen/errno.h> |
17 | | #include <xen/sched.h> |
18 | | #include <xen/domain.h> |
19 | | #include <xen/smp.h> |
20 | | #include <xen/delay.h> |
21 | | #include <xen/softirq.h> |
22 | | #include <xen/grant_table.h> |
23 | | #include <xen/iocap.h> |
24 | | #include <xen/kernel.h> |
25 | | #include <xen/hypercall.h> |
26 | | #include <xen/multicall.h> |
27 | | #include <xen/irq.h> |
28 | | #include <xen/event.h> |
29 | | #include <xen/console.h> |
30 | | #include <xen/percpu.h> |
31 | | #include <xen/compat.h> |
32 | | #include <xen/acpi.h> |
33 | | #include <xen/pci.h> |
34 | | #include <xen/paging.h> |
35 | | #include <xen/cpu.h> |
36 | | #include <xen/wait.h> |
37 | | #include <xen/guest_access.h> |
38 | | #include <xen/livepatch.h> |
39 | | #include <public/sysctl.h> |
40 | | #include <public/hvm/hvm_vcpu.h> |
41 | | #include <asm/regs.h> |
42 | | #include <asm/mc146818rtc.h> |
43 | | #include <asm/system.h> |
44 | | #include <asm/io.h> |
45 | | #include <asm/processor.h> |
46 | | #include <asm/desc.h> |
47 | | #include <asm/i387.h> |
48 | | #include <asm/xstate.h> |
49 | | #include <asm/cpuidle.h> |
50 | | #include <asm/mpspec.h> |
51 | | #include <asm/ldt.h> |
52 | | #include <asm/hvm/hvm.h> |
53 | | #include <asm/hvm/nestedhvm.h> |
54 | | #include <asm/hvm/support.h> |
55 | | #include <asm/hvm/viridian.h> |
56 | | #include <asm/debugreg.h> |
57 | | #include <asm/msr.h> |
58 | | #include <asm/traps.h> |
59 | | #include <asm/nmi.h> |
60 | | #include <asm/mce.h> |
61 | | #include <asm/amd.h> |
62 | | #include <xen/numa.h> |
63 | | #include <xen/iommu.h> |
64 | | #include <compat/vcpu.h> |
65 | | #include <asm/psr.h> |
66 | | #include <asm/pv/domain.h> |
67 | | #include <asm/pv/mm.h> |
68 | | |
69 | | DEFINE_PER_CPU(struct vcpu *, curr_vcpu); |
70 | | |
71 | | static void default_idle(void); |
72 | | void (*pm_idle) (void) __read_mostly = default_idle; |
73 | | void (*dead_idle) (void) __read_mostly = default_dead_idle; |
74 | | |
75 | | static void default_idle(void) |
76 | 0 | { |
77 | 0 | local_irq_disable(); |
78 | 0 | if ( cpu_is_haltable(smp_processor_id()) ) |
79 | 0 | safe_halt(); |
80 | 0 | else |
81 | 0 | local_irq_enable(); |
82 | 0 | } |
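
The ordering above is the classic idle race avoidance: the haltable check is made with interrupts disabled, and safe_halt() must re-enable them atomically with halting, otherwise a wakeup interrupt arriving between the check and the HLT would be lost. A minimal model of the primitive, assuming x86 and GCC-style inline assembly (illustrative, not Xen's exact definition):

    /* Model of safe_halt(): STI has a one-instruction interrupt shadow,
     * so no interrupt can be delivered between re-enabling IRQs and
     * entering HLT -- the wakeup cannot slip into the gap.  Privileged;
     * shown for illustration, not executed. */
    static inline void safe_halt_model(void)
    {
        asm volatile ( "sti; hlt" ::: "memory" );
    }

    int main(void) { (void)safe_halt_model; return 0; }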
83 | | |
84 | | void default_dead_idle(void) |
85 | 0 | { |
86 | 0 | /* |
87 | 0 | * When going into S3, modified data may be held indefinitely in the |
88 | 0 | * caches of CPUs spinning here unless those caches are flushed, only |
89 | 0 | * to be discarded by a subsequent INIT. |
90 | 0 | */ |
91 | 0 | wbinvd(); |
92 | 0 | for ( ; ; ) |
93 | 0 | halt(); |
94 | 0 | } |
95 | | |
96 | | static void play_dead(void) |
97 | 0 | { |
98 | 0 | local_irq_disable(); |
99 | 0 |
100 | 0 | /* |
101 | 0 | * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible, |
102 | 0 | * as they may be freed at any time. In this case, heap corruption or |
103 | 0 | * #PF can occur (when heap debugging is enabled). For example, even |
104 | 0 | * printk() can involve tasklet scheduling, which touches per-cpu vars. |
105 | 0 | * |
106 | 0 | * Consider very carefully when adding code to *dead_idle. Most hypervisor |
107 | 0 | * subsystems are unsafe to call. |
108 | 0 | */ |
109 | 0 | cpu_exit_clear(smp_processor_id()); |
110 | 0 |
111 | 0 | (*dead_idle)(); |
112 | 0 | } |
113 | | |
114 | | static void idle_loop(void) |
115 | 64.8k | { |
116 | 64.8k | unsigned int cpu = smp_processor_id(); |
117 | 64.8k | |
118 | 64.8k | for ( ; ; ) |
119 | 1.87M | { |
120 | 1.87M | if ( cpu_is_offline(cpu) ) |
121 | 0 | play_dead(); |
122 | 1.87M | |
123 | 1.87M | /* Are we here for running vcpu context tasklets, or for idling? */ |
124 | 1.87M | if ( unlikely(tasklet_work_to_do(cpu)) ) |
125 | 44 | do_tasklet(); |
126 | 1.87M | /* |
127 | 1.87M | * Test softirqs twice --- first to see if we should even try scrubbing |
128 | 1.87M | * and then, after it is done, whether softirqs became pending |
129 | 1.87M | * while we were scrubbing. |
130 | 1.87M | */ |
131 | 1.95M | else if ( !softirq_pending(cpu) && !scrub_free_pages() && |
132 | 2.11M | !softirq_pending(cpu) ) |
133 | 2.11M | pm_idle(); |
134 | 1.87M | do_softirq(); |
135 | 1.87M | /* |
136 | 1.87M | * We MUST be last (or before pm_idle). Otherwise after we get the |
137 | 1.87M | * softirq we would execute pm_idle (and sleep) and not patch. |
138 | 1.87M | */ |
139 | 1.87M | check_for_livepatch_work(); |
140 | 1.87M | } |
141 | 64.8k | } |
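
The double softirq_pending() test brackets the potentially long scrub pass: work arriving while scrubbing must prevent the subsequent sleep. A userspace sketch of the same test-work-retest idiom (all names are stand-ins, not Xen APIs):

    #include <stdbool.h>
    #include <stdio.h>

    static volatile bool pending;                     /* softirq_pending() stand-in */
    static bool scrub_pass(void) { return false; }    /* scrub_free_pages() stand-in */
    static void sleep_now(void) { puts("sleeping"); } /* pm_idle() stand-in */

    /* Sleep only if nothing was pending before the scrub pass, the pass
     * reports nothing left to do, and nothing became pending meanwhile. */
    static void idle_step(void)
    {
        if ( !pending && !scrub_pass() && !pending )
            sleep_now();
    }

    int main(void) { idle_step(); return 0; }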
142 | | |
143 | | void startup_cpu_idle_loop(void) |
144 | 12 | { |
145 | 12 | struct vcpu *v = current; |
146 | 12 | |
147 | 12 | ASSERT(is_idle_vcpu(v)); |
148 | 12 | cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask); |
149 | 12 | cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask); |
150 | 12 | |
151 | 12 | reset_stack_and_jump(idle_loop); |
152 | 12 | } |
153 | | |
154 | | static void noreturn continue_idle_domain(struct vcpu *v) |
155 | 64.8k | { |
156 | 64.8k | reset_stack_and_jump(idle_loop); |
157 | 64.8k | } |
158 | | |
159 | | void dump_pageframe_info(struct domain *d) |
160 | 0 | { |
161 | 0 | struct page_info *page; |
162 | 0 |
163 | 0 | printk("Memory pages belonging to domain %u:\n", d->domain_id); |
164 | 0 |
165 | 0 | if ( d->tot_pages >= 10 && d->is_dying < DOMDYING_dead ) |
166 | 0 | { |
167 | 0 | printk(" DomPage list too long to display\n"); |
168 | 0 | } |
169 | 0 | else |
170 | 0 | { |
171 | 0 | unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {}; |
172 | 0 |
173 | 0 | spin_lock(&d->page_alloc_lock); |
174 | 0 | page_list_for_each ( page, &d->page_list ) |
175 | 0 | { |
176 | 0 | unsigned int index = MASK_EXTR(page->u.inuse.type_info, |
177 | 0 | PGT_type_mask); |
178 | 0 |
179 | 0 | if ( ++total[index] > 16 ) |
180 | 0 | { |
181 | 0 | switch ( page->u.inuse.type_info & PGT_type_mask ) |
182 | 0 | { |
183 | 0 | case PGT_none: |
184 | 0 | case PGT_writable_page: |
185 | 0 | continue; |
186 | 0 | } |
187 | 0 | } |
188 | 0 | printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n", |
189 | 0 | _p(page_to_mfn(page)), |
190 | 0 | page->count_info, page->u.inuse.type_info); |
191 | 0 | } |
192 | 0 | spin_unlock(&d->page_alloc_lock); |
193 | 0 | } |
194 | 0 |
195 | 0 | if ( is_hvm_domain(d) ) |
196 | 0 | p2m_pod_dump_data(d); |
197 | 0 |
198 | 0 | spin_lock(&d->page_alloc_lock); |
199 | 0 | page_list_for_each ( page, &d->xenpage_list ) |
200 | 0 | { |
201 | 0 | printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n", |
202 | 0 | _p(page_to_mfn(page)), |
203 | 0 | page->count_info, page->u.inuse.type_info); |
204 | 0 | } |
205 | 0 | spin_unlock(&d->page_alloc_lock); |
206 | 0 | } |
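
The total[] array above is sized as MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1, i.e. one counter per representable type value. A standalone model of the macro; the definition shown matches Xen's usual one but is restated here as an assumption:

    #include <stdio.h>

    /* Isolate the bits of v under mask m, then shift them down by
     * dividing by the mask's lowest set bit. */
    #define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))

    int main(void)
    {
        /* An illustrative 3-bit type field occupying bits 5-7. */
        unsigned long mask = 0xe0, info = 0xa5;

        printf("%lu\n", MASK_EXTR(info, mask));     /* 5, i.e. 0xa0 >> 5 */
        printf("%lu\n", MASK_EXTR(mask, mask) + 1); /* 8 possible values */
        return 0;
    }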
207 | | |
208 | | void update_guest_memory_policy(struct vcpu *v, |
209 | | struct guest_memory_policy *policy) |
210 | 0 | { |
211 | 0 | smap_check_policy_t old_smap_policy = v->arch.smap_check_policy; |
212 | 0 | bool old_guest_mode = nestedhvm_is_n2(v); |
213 | 0 | bool new_guest_mode = policy->nested_guest_mode; |
214 | 0 |
215 | 0 | v->arch.smap_check_policy = policy->smap_policy; |
216 | 0 | policy->smap_policy = old_smap_policy; |
217 | 0 |
218 | 0 | /* |
219 | 0 | * When 'v' is in the nested guest mode, all guest copy |
220 | 0 | * functions/macros which finally call paging_gva_to_gfn() |
221 | 0 | * transfer data to/from L2 guest. If the copy is intended for L1 |
222 | 0 | * guest, we must first clear the nested guest flag (by setting |
223 | 0 | * policy->nested_guest_mode to false) before the copy and then |
224 | 0 | * restore the nested guest flag (by setting |
225 | 0 | * policy->nested_guest_mode to true) after the copy. |
226 | 0 | */ |
227 | 0 | if ( unlikely(old_guest_mode != new_guest_mode) ) |
228 | 0 | { |
229 | 0 | if ( new_guest_mode ) |
230 | 0 | nestedhvm_vcpu_enter_guestmode(v); |
231 | 0 | else |
232 | 0 | nestedhvm_vcpu_exit_guestmode(v); |
233 | 0 | policy->nested_guest_mode = old_guest_mode; |
234 | 0 | } |
235 | 0 | } |
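
Note the swap semantics: the vCPU's previous settings are written back into *policy, so a second call with the same object undoes the first; update_runstate_area() further down brackets its guest copies exactly this way. A userspace model of that contract:

    #include <stdbool.h>
    #include <stdio.h>

    /* Made-up miniature of the policy swap: update() exchanges the
     * caller's policy with the current one, so calling it twice with
     * the same object is install-then-restore. */
    struct policy { int smap; bool nested; };
    static struct policy cur = { 1, true };

    static void update(struct policy *p)
    {
        struct policy old = cur;

        cur = *p;
        *p = old;
    }

    int main(void)
    {
        struct policy p = { 0, false };

        update(&p);  /* install the new policy; p now holds the old one */
        /* ... guest copy operations under the new policy ... */
        update(&p);  /* restore the saved policy */
        printf("%d %d\n", cur.smap, (int)cur.nested); /* 1 1 */
        return 0;
    }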
236 | | |
237 | | #ifndef CONFIG_BIGMEM |
238 | | /* |
239 | | * The hole may be at or above the 44-bit boundary, so we need to determine |
240 | | * the total bit count until reaching 32 significant (not squashed out) bits |
241 | | * in PFN representations. |
242 | | * Note that the way "bits" gets initialized/updated/bounds-checked guarantees |
243 | | * that the function will never return zero, and hence will never be called |
244 | | * more than once (which is important due to it being deliberately placed in |
245 | | * .init.text). |
246 | | */ |
247 | | static unsigned int __init noinline _domain_struct_bits(void) |
248 | 1 | { |
249 | 1 | unsigned int bits = 32 + PAGE_SHIFT; |
250 | 1 | unsigned int sig = hweight32(~pfn_hole_mask); |
251 | 1 | unsigned int mask = pfn_hole_mask >> 32; |
252 | 1 | |
253 | 1 | for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 ) |
254 | 0 | if ( !(mask & 1) ) |
255 | 0 | ++sig; |
256 | 1 | |
257 | 1 | return bits; |
258 | 1 | } |
259 | | #endif |
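
Put differently: start from 44 address bits (32 PFN bits plus PAGE_SHIFT) and widen by one for every squashed-out bit encountered until 32 significant PFN bits have been accumulated. A standalone model with a made-up hole mask (the pfn_hole_mask value and PAGE_SHIFT are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define BITS_PER_LONG 64

    int main(void)
    {
        /* Made-up hole: PFN bits 28-31 squashed out of the PDX space. */
        uint64_t pfn_hole_mask = 0xf0000000ULL;
        unsigned int bits = 32 + PAGE_SHIFT;
        unsigned int sig = __builtin_popcount((uint32_t)~pfn_hole_mask);
        uint64_t mask = pfn_hole_mask >> 32;

        for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
            if ( !(mask & 1) )
                ++sig;

        printf("%u\n", bits); /* 48: four hole bits push the width up from 44 */
        return 0;
    }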
260 | | |
261 | | struct domain *alloc_domain_struct(void) |
262 | 5 | { |
263 | 5 | struct domain *d; |
264 | 5 | unsigned int order = get_order_from_bytes(sizeof(*d)); |
265 | 5 | #ifdef CONFIG_BIGMEM |
266 | | const unsigned int bits = 0; |
267 | | #else |
268 | 5 | /* |
269 | 5 | * We pack the PDX of the domain structure into a 32-bit field within |
270 | 5 | * the page_info structure. Hence the MEMF_bits() restriction. |
271 | 5 | */ |
272 | 5 | static unsigned int __read_mostly bits; |
273 | 5 | |
274 | 5 | if ( unlikely(!bits) ) |
275 | 1 | bits = _domain_struct_bits(); |
276 | 5 | #endif |
277 | 5 | |
278 | 5 | |
279 | 5 | #ifndef CONFIG_LOCK_PROFILE |
280 | 5 | BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE); |
281 | 5 | #endif |
282 | 5 | d = alloc_xenheap_pages(order, MEMF_bits(bits)); |
283 | 5 | if ( d != NULL ) |
284 | 5 | { |
285 | 5 | unsigned int sz; |
286 | 5 | |
287 | 10 | for ( sz = 0; sz < (PAGE_SIZE << order); sz += PAGE_SIZE ) |
288 | 5 | clear_page((void *)d + sz); |
289 | 5 | } |
290 | 5 | return d; |
291 | 5 | } |
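
The allocation is sized with get_order_from_bytes(), i.e. the smallest power-of-two number of pages covering the structure, and MEMF_bits() then caps the physical address width so the PDX fits the 32-bit page_info field. A sketch of the order computation (an illustrative reimplementation, not Xen's):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Smallest order such that (PAGE_SIZE << order) >= bytes. */
    static unsigned int order_from_bytes(unsigned long bytes)
    {
        unsigned int order = 0;

        while ( (PAGE_SIZE << order) < bytes )
            order++;
        return order;
    }

    int main(void)
    {
        /* 0 1: one page suffices for 4096 bytes, 5000 needs two. */
        printf("%u %u\n", order_from_bytes(4096), order_from_bytes(5000));
        return 0;
    }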
292 | | |
293 | | void free_domain_struct(struct domain *d) |
294 | 0 | { |
295 | 0 | lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d); |
296 | 0 | free_xenheap_page(d); |
297 | 0 | } |
298 | | |
299 | | struct vcpu *alloc_vcpu_struct(void) |
300 | 24 | { |
301 | 24 | struct vcpu *v; |
302 | 24 | /* |
303 | 24 | * This structure contains embedded PAE PDPTEs, used when an HVM guest |
304 | 24 | * runs on shadow pagetables outside of 64-bit mode. In this case the CPU |
305 | 24 | * may require that the shadow CR3 points below 4GB, and hence the whole |
306 | 24 | * structure must satisfy this restriction. Thus we specify MEMF_bits(32). |
307 | 24 | */ |
308 | 24 | BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE); |
309 | 24 | v = alloc_xenheap_pages(0, MEMF_bits(32)); |
310 | 24 | if ( v != NULL ) |
311 | 24 | clear_page(v); |
312 | 24 | return v; |
313 | 24 | } |
314 | | |
315 | | void free_vcpu_struct(struct vcpu *v) |
316 | 0 | { |
317 | 0 | free_xenheap_page(v); |
318 | 0 | } |
319 | | |
320 | | int vcpu_initialise(struct vcpu *v) |
321 | 24 | { |
322 | 24 | struct domain *d = v->domain; |
323 | 24 | int rc; |
324 | 24 | |
325 | 24 | v->arch.flags = TF_kernel_mode; |
326 | 24 | |
327 | 24 | rc = mapcache_vcpu_init(v); |
328 | 24 | if ( rc ) |
329 | 0 | return rc; |
330 | 24 | |
331 | 24 | if ( !is_idle_domain(d) ) |
332 | 12 | { |
333 | 12 | paging_vcpu_init(v); |
334 | 12 | |
335 | 12 | if ( (rc = vcpu_init_fpu(v)) != 0 ) |
336 | 0 | return rc; |
337 | 12 | |
338 | 12 | vmce_init_vcpu(v); |
339 | 12 | } |
340 | 12 | else if ( (rc = xstate_alloc_save_area(v)) != 0 ) |
341 | 0 | return rc; |
342 | 24 | |
343 | 24 | spin_lock_init(&v->arch.vpmu.vpmu_lock); |
344 | 24 | |
345 | 24 | if ( is_hvm_domain(d) ) |
346 | 12 | rc = hvm_vcpu_initialise(v); |
347 | 12 | else if ( !is_idle_domain(d) ) |
348 | 0 | rc = pv_vcpu_initialise(v); |
349 | 12 | else |
350 | 12 | { |
351 | 12 | /* Idle domain */ |
352 | 12 | v->arch.cr3 = __pa(idle_pg_table); |
353 | 12 | rc = 0; |
354 | 12 | v->arch.msr = ZERO_BLOCK_PTR; /* Catch stray misuses */ |
355 | 12 | } |
356 | 24 | |
357 | 24 | if ( rc ) |
358 | 0 | goto fail; |
359 | 24 | |
360 | 24 | if ( !is_idle_domain(v->domain) ) |
361 | 12 | { |
362 | 12 | vpmu_initialise(v); |
363 | 12 | |
364 | 12 | if ( (rc = init_vcpu_msr_policy(v)) ) |
365 | 0 | goto fail; |
366 | 12 | } |
367 | 24 | |
368 | 24 | return rc; |
369 | 24 | |
370 | 0 | fail: |
371 | 0 | vcpu_destroy_fpu(v); |
372 | 0 | xfree(v->arch.msr); |
373 | 0 | v->arch.msr = NULL; |
374 | 0 |
375 | 0 | return rc; |
376 | 24 | } |
377 | | |
378 | | void vcpu_destroy(struct vcpu *v) |
379 | 0 | { |
380 | 0 | xfree(v->arch.vm_event); |
381 | 0 | v->arch.vm_event = NULL; |
382 | 0 |
383 | 0 | vcpu_destroy_fpu(v); |
384 | 0 |
385 | 0 | if ( !is_idle_domain(v->domain) ) |
386 | 0 | vpmu_destroy(v); |
387 | 0 |
388 | 0 | if ( is_hvm_vcpu(v) ) |
389 | 0 | hvm_vcpu_destroy(v); |
390 | 0 | else |
391 | 0 | pv_vcpu_destroy(v); |
392 | 0 | } |
393 | | |
394 | | static bool emulation_flags_ok(const struct domain *d, uint32_t emflags) |
395 | 1 | { |
396 | 1 | |
397 | 1 | if ( is_hvm_domain(d) ) |
398 | 1 | { |
399 | 1 | if ( is_hardware_domain(d) && |
400 | 1 | emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC| |
401 | 1 | XEN_X86_EMU_VPCI) ) |
402 | 0 | return false; |
403 | 1 | if ( !is_hardware_domain(d) ) |
404 | 0 | { |
405 | 0 | switch ( emflags ) |
406 | 0 | { |
407 | 0 | case XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI: |
408 | 0 | case XEN_X86_EMU_LAPIC: |
409 | 0 | case 0: |
410 | 0 | break; |
411 | 0 | default: |
412 | 0 | return false; |
413 | 0 | } |
414 | 0 | } |
415 | 1 | } |
416 | 0 | else if ( emflags != 0 && emflags != XEN_X86_EMU_PIT ) |
417 | 0 | { |
418 | 0 | /* PV or classic PVH. */ |
419 | 0 | return false; |
420 | 0 | } |
421 | 1 | |
422 | 1 | return true; |
423 | 1 | } |
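
So for an HVM domU exactly three configurations pass: the full emulator set minus vPCI, a local APIC alone, or no emulation at all. A userspace model of that branch (the EMU_* values are stand-ins for the real XEN_X86_EMU_* bits):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EMU_LAPIC (1u << 0) /* stand-in for XEN_X86_EMU_LAPIC */
    #define EMU_VPCI  (1u << 1) /* stand-in for XEN_X86_EMU_VPCI */
    #define EMU_ALL   0xffffu   /* stand-in for XEN_X86_EMU_ALL */

    static bool domu_hvm_flags_ok(uint32_t emflags)
    {
        switch ( emflags )
        {
        case EMU_ALL & ~EMU_VPCI:
        case EMU_LAPIC:
        case 0:
            return true;
        default:
            return false;
        }
    }

    int main(void)
    {
        /* 1 0: LAPIC alone is fine, vPCI alone is rejected. */
        printf("%d %d\n", domu_hvm_flags_ok(EMU_LAPIC),
               domu_hvm_flags_ok(EMU_VPCI));
        return 0;
    }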
424 | | |
425 | | int arch_domain_create(struct domain *d, unsigned int domcr_flags, |
426 | | struct xen_arch_domainconfig *config) |
427 | 2 | { |
428 | 2 | bool paging_initialised = false; |
429 | 2 | int rc; |
430 | 2 | |
431 | 2 | if ( config == NULL && !is_idle_domain(d) ) |
432 | 0 | return -EINVAL; |
433 | 2 | |
434 | 2 | d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity); |
435 | 2 | |
436 | 2 | INIT_LIST_HEAD(&d->arch.pdev_list); |
437 | 2 | |
438 | 2 | d->arch.relmem = RELMEM_not_started; |
439 | 2 | INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); |
440 | 2 | |
441 | 2 | if ( d->domain_id && !is_idle_domain(d) && |
442 | 0 | cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) ) |
443 | 0 | { |
444 | 0 | if ( !opt_allow_unsafe ) |
445 | 0 | { |
446 | 0 | printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU" |
447 | 0 | " for security reasons.\n"); |
448 | 0 | return -EPERM; |
449 | 0 | } |
450 | 0 | printk(XENLOG_G_WARNING |
451 | 0 | "Dom%d may compromise security on this CPU.\n", |
452 | 0 | d->domain_id); |
453 | 0 | } |
454 | 2 | |
455 | 2 | if ( is_idle_domain(d) ) |
456 | 1 | { |
457 | 1 | d->arch.emulation_flags = 0; |
458 | 1 | d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */ |
459 | 1 | d->arch.msr = ZERO_BLOCK_PTR; |
460 | 1 | } |
461 | 2 | else |
462 | 1 | { |
463 | 1 | uint32_t emflags; |
464 | 1 | |
465 | 1 | if ( is_hardware_domain(d) && is_pv_domain(d) ) |
466 | 0 | config->emulation_flags |= XEN_X86_EMU_PIT; |
467 | 1 | |
468 | 1 | emflags = config->emulation_flags; |
469 | 1 | if ( emflags & ~XEN_X86_EMU_ALL ) |
470 | 0 | { |
471 | 0 | printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n", |
472 | 0 | d->domain_id, emflags); |
473 | 0 | return -EINVAL; |
474 | 0 | } |
475 | 1 | |
476 | 1 | if ( !emulation_flags_ok(d, emflags) ) |
477 | 0 | { |
478 | 0 | printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation " |
479 | 0 | "with the current selection of emulators: %#x\n", |
480 | 0 | d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags); |
481 | 0 | return -EOPNOTSUPP; |
482 | 0 | } |
483 | 1 | d->arch.emulation_flags = emflags; |
484 | 1 | } |
485 | 2 | |
486 | 2 | mapcache_domain_init(d); |
487 | 2 | |
488 | 2 | HYPERVISOR_COMPAT_VIRT_START(d) = |
489 | 2 | is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u; |
490 | 2 | |
491 | 2 | if ( !is_idle_domain(d) ) |
492 | 1 | { |
493 | 1 | /* Need to determine if HAP is enabled before initialising paging */ |
494 | 1 | if ( is_hvm_domain(d) ) |
495 | 1 | d->arch.hvm_domain.hap_enabled = |
496 | 1 | hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap); |
497 | 1 | |
498 | 1 | if ( (rc = paging_domain_init(d, domcr_flags)) != 0 ) |
499 | 0 | goto fail; |
500 | 1 | paging_initialised = 1; |
501 | 1 | |
502 | 1 | if ( (rc = init_domain_cpuid_policy(d)) ) |
503 | 0 | goto fail; |
504 | 1 | |
505 | 1 | if ( (rc = init_domain_msr_policy(d)) ) |
506 | 0 | goto fail; |
507 | 1 | |
508 | 1 | d->arch.ioport_caps = |
509 | 1 | rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); |
510 | 1 | rc = -ENOMEM; |
511 | 1 | if ( d->arch.ioport_caps == NULL ) |
512 | 0 | goto fail; |
513 | 1 | |
514 | 1 | /* |
515 | 1 | * The shared_info machine address must fit in a 32-bit field within a |
516 | 1 | * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32). |
517 | 1 | */ |
518 | 1 | if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL ) |
519 | 0 | goto fail; |
520 | 1 | |
521 | 1 | clear_page(d->shared_info); |
522 | 1 | share_xen_page_with_guest( |
523 | 1 | virt_to_page(d->shared_info), d, XENSHARE_writable); |
524 | 1 | |
525 | 1 | if ( (rc = init_domain_irq_mapping(d)) != 0 ) |
526 | 0 | goto fail; |
527 | 1 | |
528 | 1 | if ( (rc = iommu_domain_init(d)) != 0 ) |
529 | 0 | goto fail; |
530 | 1 | } |
531 | 2 | spin_lock_init(&d->arch.e820_lock); |
532 | 2 | |
533 | 2 | psr_domain_init(d); |
534 | 2 | |
535 | 2 | if ( is_hvm_domain(d) ) |
536 | 1 | { |
537 | 1 | if ( (rc = hvm_domain_initialise(d, domcr_flags, config)) != 0 ) |
538 | 0 | goto fail; |
539 | 1 | } |
540 | 1 | else if ( is_idle_domain(d) ) |
541 | 1 | { |
542 | 1 | static const struct arch_csw idle_csw = { |
543 | 1 | .from = paravirt_ctxt_switch_from, |
544 | 1 | .to = paravirt_ctxt_switch_to, |
545 | 1 | .tail = continue_idle_domain, |
546 | 1 | }; |
547 | 1 | |
548 | 1 | d->arch.ctxt_switch = &idle_csw; |
549 | 1 | } |
550 | 1 | else |
551 | 0 | { |
552 | 0 | if ( (rc = pv_domain_initialise(d, domcr_flags, config)) != 0 ) |
553 | 0 | goto fail; |
554 | 0 | } |
555 | 2 | |
556 | 2 | /* initialize default tsc behavior in case tools don't */ |
557 | 2 | tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0); |
558 | 2 | spin_lock_init(&d->arch.vtsc_lock); |
559 | 2 | |
560 | 2 | /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */ |
561 | 2 | pit_init(d, cpu_khz); |
562 | 2 | |
563 | 2 | /* |
564 | 2 | * If the FPU does not save FCS/FDS then we can always |
565 | 2 | * save/restore the 64-bit FIP/FDP and ignore the selectors. |
566 | 2 | */ |
567 | 2 | d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8; |
568 | 2 | |
569 | 2 | return 0; |
570 | 2 | |
571 | 0 | fail: |
572 | 0 | d->is_dying = DOMDYING_dead; |
573 | 0 | psr_domain_free(d); |
574 | 0 | iommu_domain_destroy(d); |
575 | 0 | cleanup_domain_irq_mapping(d); |
576 | 0 | free_xenheap_page(d->shared_info); |
577 | 0 | xfree(d->arch.cpuid); |
578 | 0 | xfree(d->arch.msr); |
579 | 0 | if ( paging_initialised ) |
580 | 0 | paging_final_teardown(d); |
581 | 0 | free_perdomain_mappings(d); |
582 | 0 |
583 | 0 | return rc; |
584 | 2 | } |
585 | | |
586 | | void arch_domain_destroy(struct domain *d) |
587 | 0 | { |
588 | 0 | if ( is_hvm_domain(d) ) |
589 | 0 | hvm_domain_destroy(d); |
590 | 0 |
591 | 0 | xfree(d->arch.e820); |
592 | 0 | xfree(d->arch.cpuid); |
593 | 0 | xfree(d->arch.msr); |
594 | 0 |
595 | 0 | free_domain_pirqs(d); |
596 | 0 | if ( !is_idle_domain(d) ) |
597 | 0 | iommu_domain_destroy(d); |
598 | 0 |
599 | 0 | paging_final_teardown(d); |
600 | 0 |
601 | 0 | if ( is_pv_domain(d) ) |
602 | 0 | pv_domain_destroy(d); |
603 | 0 | free_perdomain_mappings(d); |
604 | 0 |
605 | 0 | free_xenheap_page(d->shared_info); |
606 | 0 | cleanup_domain_irq_mapping(d); |
607 | 0 |
608 | 0 | psr_domain_free(d); |
609 | 0 | } |
610 | | |
611 | | void arch_domain_shutdown(struct domain *d) |
612 | 0 | { |
613 | 0 | if ( has_viridian_time_ref_count(d) ) |
614 | 0 | viridian_time_ref_count_freeze(d); |
615 | 0 | } |
616 | | |
617 | | void arch_domain_pause(struct domain *d) |
618 | 1 | { |
619 | 1 | if ( has_viridian_time_ref_count(d) ) |
620 | 0 | viridian_time_ref_count_freeze(d); |
621 | 1 | } |
622 | | |
623 | | void arch_domain_unpause(struct domain *d) |
624 | 2 | { |
625 | 2 | if ( has_viridian_time_ref_count(d) ) |
626 | 0 | viridian_time_ref_count_thaw(d); |
627 | 2 | } |
628 | | |
629 | | int arch_domain_soft_reset(struct domain *d) |
630 | 0 | { |
631 | 0 | struct page_info *page = virt_to_page(d->shared_info), *new_page; |
632 | 0 | int ret = 0; |
633 | 0 | struct domain *owner; |
634 | 0 | unsigned long mfn, gfn; |
635 | 0 | p2m_type_t p2mt; |
636 | 0 | unsigned int i; |
637 | 0 |
638 | 0 | /* Soft reset is supported for HVM domains only. */ |
639 | 0 | if ( !is_hvm_domain(d) ) |
640 | 0 | return -EINVAL; |
641 | 0 |
642 | 0 | hvm_domain_soft_reset(d); |
643 | 0 |
644 | 0 | spin_lock(&d->event_lock); |
645 | 0 | for ( i = 0; i < d->nr_pirqs ; i++ ) |
646 | 0 | { |
647 | 0 | if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND ) |
648 | 0 | { |
649 | 0 | ret = unmap_domain_pirq_emuirq(d, i); |
650 | 0 | if ( ret ) |
651 | 0 | break; |
652 | 0 | } |
653 | 0 | } |
654 | 0 | spin_unlock(&d->event_lock); |
655 | 0 |
656 | 0 | if ( ret ) |
657 | 0 | return ret; |
658 | 0 |
659 | 0 | /* |
660 | 0 | * The shared_info page needs to be replaced with a new page, otherwise we |
661 | 0 | * will get a hole if the domain does XENMAPSPACE_shared_info. |
662 | 0 | */ |
663 | 0 |
664 | 0 | owner = page_get_owner_and_reference(page); |
665 | 0 | ASSERT( owner == d ); |
666 | 0 |
667 | 0 | mfn = page_to_mfn(page); |
668 | 0 | gfn = mfn_to_gmfn(d, mfn); |
669 | 0 |
670 | 0 | /* |
671 | 0 | * gfn == INVALID_GFN indicates that the shared_info page was never mapped |
672 | 0 | * to the domain's address space and there is nothing to replace. |
673 | 0 | */ |
674 | 0 | if ( gfn == gfn_x(INVALID_GFN) ) |
675 | 0 | goto exit_put_page; |
676 | 0 |
677 | 0 | if ( mfn_x(get_gfn_query(d, gfn, &p2mt)) != mfn ) |
678 | 0 | { |
679 | 0 | printk(XENLOG_G_ERR "Failed to get Dom%d's shared_info GFN (%lx)\n", |
680 | 0 | d->domain_id, gfn); |
681 | 0 | ret = -EINVAL; |
682 | 0 | goto exit_put_page; |
683 | 0 | } |
684 | 0 |
685 | 0 | new_page = alloc_domheap_page(d, 0); |
686 | 0 | if ( !new_page ) |
687 | 0 | { |
688 | 0 | printk(XENLOG_G_ERR "Failed to alloc a page to replace" |
689 | 0 | " Dom%d's shared_info frame %lx\n", d->domain_id, gfn); |
690 | 0 | ret = -ENOMEM; |
691 | 0 | goto exit_put_gfn; |
692 | 0 | } |
693 | 0 |
694 | 0 | ret = guest_physmap_remove_page(d, _gfn(gfn), _mfn(mfn), PAGE_ORDER_4K); |
695 | 0 | if ( ret ) |
696 | 0 | { |
697 | 0 | printk(XENLOG_G_ERR "Failed to remove Dom%d's shared_info frame %lx\n", |
698 | 0 | d->domain_id, gfn); |
699 | 0 | free_domheap_page(new_page); |
700 | 0 | goto exit_put_gfn; |
701 | 0 | } |
702 | 0 |
703 | 0 | ret = guest_physmap_add_page(d, _gfn(gfn), _mfn(page_to_mfn(new_page)), |
704 | 0 | PAGE_ORDER_4K); |
705 | 0 | if ( ret ) |
706 | 0 | { |
707 | 0 | printk(XENLOG_G_ERR "Failed to add a page to replace" |
708 | 0 | " Dom%d's shared_info frame %lx\n", d->domain_id, gfn); |
709 | 0 | free_domheap_page(new_page); |
710 | 0 | } |
711 | 0 | exit_put_gfn: |
712 | 0 | put_gfn(d, gfn); |
713 | 0 | exit_put_page: |
714 | 0 | put_page(page); |
715 | 0 |
716 | 0 | return ret; |
717 | 0 | } |
718 | | |
719 | | /* |
720 | | * These are the masks of CR4 bits (subject to hardware availability) which a |
721 | | * PV guest may not legitimately attempt to modify. |
722 | | */ |
723 | | static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask; |
724 | | |
725 | | static int __init init_pv_cr4_masks(void) |
726 | 1 | { |
727 | 1 | unsigned long common_mask = ~X86_CR4_TSD; |
728 | 1 | |
729 | 1 | /* |
730 | 1 | * All PV guests may attempt to modify TSD, DE and OSXSAVE. |
731 | 1 | */ |
732 | 1 | if ( cpu_has_de ) |
733 | 1 | common_mask &= ~X86_CR4_DE; |
734 | 1 | if ( cpu_has_xsave ) |
735 | 1 | common_mask &= ~X86_CR4_OSXSAVE; |
736 | 1 | |
737 | 1 | pv_cr4_mask = compat_pv_cr4_mask = common_mask; |
738 | 1 | |
739 | 1 | /* |
740 | 1 | * 64bit PV guests may attempt to modify FSGSBASE. |
741 | 1 | */ |
742 | 1 | if ( cpu_has_fsgsbase ) |
743 | 1 | pv_cr4_mask &= ~X86_CR4_FSGSBASE; |
744 | 1 | |
745 | 1 | return 0; |
746 | 1 | } |
747 | | __initcall(init_pv_cr4_masks); |
748 | | |
749 | | unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4) |
750 | 0 | { |
751 | 0 | unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4()); |
752 | 0 | unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask; |
753 | 0 |
754 | 0 | if ( (guest_cr4 & mask) != (hv_cr4 & mask) ) |
755 | 0 | printk(XENLOG_G_WARNING |
756 | 0 | "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n", |
757 | 0 | current->domain->domain_id, v, hv_cr4, guest_cr4); |
758 | 0 |
759 | 0 | return (hv_cr4 & mask) | (guest_cr4 & ~mask); |
760 | 0 | } |
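
The return expression is a plain bit-merge: hypervisor-owned bits (those in mask) come from the live CR4 view, everything else from the guest's request. A worked example with made-up values:

    #include <stdio.h>

    int main(void)
    {
        unsigned long hv_cr4    = 0x001506f0UL; /* made-up hypervisor view */
        unsigned long guest_cr4 = 0x00000694UL; /* made-up guest request */
        unsigned long mask      = 0x00150000UL; /* made-up fixed bits */

        /* Prints 0x150694: fixed bits kept, guest bits honoured. */
        printf("%#lx\n", (hv_cr4 & mask) | (guest_cr4 & ~mask));
        return 0;
    }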
761 | | |
762 | | #define xen_vcpu_guest_context vcpu_guest_context |
763 | | #define fpu_ctxt fpu_ctxt.x |
764 | | CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt); |
765 | | #undef fpu_ctxt |
766 | | #undef xen_vcpu_guest_context |
767 | | |
768 | | /* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */ |
769 | | int arch_set_info_guest( |
770 | | struct vcpu *v, vcpu_guest_context_u c) |
771 | 0 | { |
772 | 0 | struct domain *d = v->domain; |
773 | 0 | unsigned long cr3_gfn; |
774 | 0 | struct page_info *cr3_page; |
775 | 0 | unsigned long flags, cr4; |
776 | 0 | unsigned int i; |
777 | 0 | int rc = 0, compat; |
778 | 0 |
779 | 0 | /* The context is a compat-mode one if the target domain is compat-mode; |
780 | 0 | * we expect the tools to DTRT even in compat-mode callers. */ |
781 | 0 | compat = is_pv_32bit_domain(d); |
782 | 0 |
783 | 0 | #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld)) |
784 | 0 | flags = c(flags); |
785 | 0 |
786 | 0 | if ( is_pv_domain(d) ) |
787 | 0 | { |
788 | 0 | if ( !compat ) |
789 | 0 | { |
790 | 0 | if ( !is_canonical_address(c.nat->user_regs.rip) || |
791 | 0 | !is_canonical_address(c.nat->user_regs.rsp) || |
792 | 0 | !is_canonical_address(c.nat->kernel_sp) || |
793 | 0 | (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) || |
794 | 0 | !is_canonical_address(c.nat->fs_base) || |
795 | 0 | !is_canonical_address(c.nat->gs_base_kernel) || |
796 | 0 | !is_canonical_address(c.nat->gs_base_user) || |
797 | 0 | !is_canonical_address(c.nat->event_callback_eip) || |
798 | 0 | !is_canonical_address(c.nat->syscall_callback_eip) || |
799 | 0 | !is_canonical_address(c.nat->failsafe_callback_eip) ) |
800 | 0 | return -EINVAL; |
801 | 0 |
802 | 0 | fixup_guest_stack_selector(d, c.nat->user_regs.ss); |
803 | 0 | fixup_guest_stack_selector(d, c.nat->kernel_ss); |
804 | 0 | fixup_guest_code_selector(d, c.nat->user_regs.cs); |
805 | 0 |
806 | 0 | for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ ) |
807 | 0 | { |
808 | 0 | if ( !is_canonical_address(c.nat->trap_ctxt[i].address) ) |
809 | 0 | return -EINVAL; |
810 | 0 | fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs); |
811 | 0 | } |
812 | 0 |
813 | 0 | if ( !__addr_ok(c.nat->ldt_base) ) |
814 | 0 | return -EINVAL; |
815 | 0 | } |
816 | 0 | else |
817 | 0 | { |
818 | 0 | fixup_guest_stack_selector(d, c.cmp->user_regs.ss); |
819 | 0 | fixup_guest_stack_selector(d, c.cmp->kernel_ss); |
820 | 0 | fixup_guest_code_selector(d, c.cmp->user_regs.cs); |
821 | 0 | fixup_guest_code_selector(d, c.cmp->event_callback_cs); |
822 | 0 | fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs); |
823 | 0 |
824 | 0 | for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ ) |
825 | 0 | fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs); |
826 | 0 | } |
827 | 0 |
828 | 0 | /* LDT safety checks. */ |
829 | 0 | if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) || |
830 | 0 | (c(ldt_ents) > 8192) ) |
831 | 0 | return -EINVAL; |
832 | 0 | } |
833 | 0 |
834 | 0 | v->fpu_initialised = !!(flags & VGCF_I387_VALID); |
835 | 0 |
836 | 0 | v->arch.flags &= ~TF_kernel_mode; |
837 | 0 | if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ ) |
838 | 0 | v->arch.flags |= TF_kernel_mode; |
839 | 0 |
840 | 0 | v->arch.vgc_flags = flags; |
841 | 0 |
842 | 0 | if ( flags & VGCF_I387_VALID ) |
843 | 0 | { |
844 | 0 | memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt)); |
845 | 0 | if ( v->arch.xsave_area ) |
846 | 0 | v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; |
847 | 0 | } |
848 | 0 | else if ( v->arch.xsave_area ) |
849 | 0 | { |
850 | 0 | v->arch.xsave_area->xsave_hdr.xstate_bv = 0; |
851 | 0 | v->arch.xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT; |
852 | 0 | } |
853 | 0 | else |
854 | 0 | { |
855 | 0 | typeof(v->arch.xsave_area->fpu_sse) *fpu_sse = v->arch.fpu_ctxt; |
856 | 0 |
857 | 0 | memset(fpu_sse, 0, sizeof(*fpu_sse)); |
858 | 0 | fpu_sse->fcw = FCW_DEFAULT; |
859 | 0 | fpu_sse->mxcsr = MXCSR_DEFAULT; |
860 | 0 | } |
861 | 0 | if ( v->arch.xsave_area ) |
862 | 0 | v->arch.xsave_area->xsave_hdr.xcomp_bv = 0; |
863 | 0 |
864 | 0 | if ( !compat ) |
865 | 0 | { |
866 | 0 | memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs)); |
867 | 0 | if ( is_pv_domain(d) ) |
868 | 0 | memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt, |
869 | 0 | sizeof(c.nat->trap_ctxt)); |
870 | 0 | } |
871 | 0 | else |
872 | 0 | { |
873 | 0 | XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs); |
874 | 0 | if ( is_pv_domain(d) ) |
875 | 0 | { |
876 | 0 | for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i ) |
877 | 0 | XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, |
878 | 0 | c.cmp->trap_ctxt + i); |
879 | 0 | } |
880 | 0 | } |
881 | 0 |
882 | 0 | if ( is_hvm_domain(d) ) |
883 | 0 | { |
884 | 0 | for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) |
885 | 0 | v->arch.debugreg[i] = c(debugreg[i]); |
886 | 0 |
887 | 0 | hvm_set_info_guest(v); |
888 | 0 | goto out; |
889 | 0 | } |
890 | 0 |
891 | 0 | init_int80_direct_trap(v); |
892 | 0 |
893 | 0 | /* IOPL privileges are virtualised. */ |
894 | 0 | v->arch.pv_vcpu.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL; |
895 | 0 | v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL; |
896 | 0 |
897 | 0 | /* Ensure real hardware interrupts are enabled. */ |
898 | 0 | v->arch.user_regs.eflags |= X86_EFLAGS_IF; |
899 | 0 |
900 | 0 | if ( !v->is_initialised ) |
901 | 0 | { |
902 | 0 | if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] ) |
903 | 0 | return -EINVAL; |
904 | 0 |
905 | 0 | v->arch.pv_vcpu.ldt_base = c(ldt_base); |
906 | 0 | v->arch.pv_vcpu.ldt_ents = c(ldt_ents); |
907 | 0 | } |
908 | 0 | else |
909 | 0 | { |
910 | 0 | unsigned long pfn = pagetable_get_pfn(v->arch.guest_table); |
911 | 0 | bool fail; |
912 | 0 |
913 | 0 | if ( !compat ) |
914 | 0 | { |
915 | 0 | fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3]; |
916 | 0 | if ( pagetable_is_null(v->arch.guest_table_user) ) |
917 | 0 | fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel); |
918 | 0 | else |
919 | 0 | { |
920 | 0 | pfn = pagetable_get_pfn(v->arch.guest_table_user); |
921 | 0 | fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1]; |
922 | 0 | } |
923 | 0 | } else { |
924 | 0 | l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn)); |
925 | 0 |
926 | 0 | pfn = l4e_get_pfn(*l4tab); |
927 | 0 | unmap_domain_page(l4tab); |
928 | 0 | fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3]; |
929 | 0 | } |
930 | 0 |
931 | 0 | for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i ) |
932 | 0 | fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]); |
933 | 0 | fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents); |
934 | 0 |
935 | 0 | fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base); |
936 | 0 | fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents); |
937 | 0 |
938 | 0 | if ( fail ) |
939 | 0 | return -EOPNOTSUPP; |
940 | 0 | } |
941 | 0 |
942 | 0 | v->arch.pv_vcpu.kernel_ss = c(kernel_ss); |
943 | 0 | v->arch.pv_vcpu.kernel_sp = c(kernel_sp); |
944 | 0 | for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i ) |
945 | 0 | v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]); |
946 | 0 |
947 | 0 | v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip); |
948 | 0 | v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip); |
949 | 0 | if ( !compat ) |
950 | 0 | { |
951 | 0 | v->arch.pv_vcpu.syscall_callback_eip = c.nat->syscall_callback_eip; |
952 | 0 | v->arch.pv_vcpu.fs_base = c.nat->fs_base; |
953 | 0 | v->arch.pv_vcpu.gs_base_kernel = c.nat->gs_base_kernel; |
954 | 0 | v->arch.pv_vcpu.gs_base_user = c.nat->gs_base_user; |
955 | 0 | } |
956 | 0 | else |
957 | 0 | { |
958 | 0 | v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs); |
959 | 0 | v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs); |
960 | 0 | } |
961 | 0 |
962 | 0 | /* Only CR0.TS is modifiable by guest or admin. */ |
963 | 0 | v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS; |
964 | 0 | v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS; |
965 | 0 |
966 | 0 | cr4 = v->arch.pv_vcpu.ctrlreg[4]; |
967 | 0 | v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) : |
968 | 0 | real_cr4_to_pv_guest_cr4(mmu_cr4_features); |
969 | 0 |
970 | 0 | memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg)); |
971 | 0 | for ( i = 0; i < 8; i++ ) |
972 | 0 | (void)set_debugreg(v, i, c(debugreg[i])); |
973 | 0 |
974 | 0 | if ( v->is_initialised ) |
975 | 0 | goto out; |
976 | 0 |
977 | 0 | if ( v->vcpu_id == 0 ) |
978 | 0 | { |
979 | 0 | /* |
980 | 0 | * In the restore case we need to deal with L4 pages which got |
981 | 0 | * initialized with m2p_strict still clear (and which hence lack the |
982 | 0 | * correct initial RO_MPT_VIRT_{START,END} L4 entry). |
983 | 0 | */ |
984 | 0 | if ( d != current->domain && !VM_ASSIST(d, m2p_strict) && |
985 | 0 | is_pv_domain(d) && !is_pv_32bit_domain(d) && |
986 | 0 | test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) && |
987 | 0 | atomic_read(&d->arch.pv_domain.nr_l4_pages) ) |
988 | 0 | { |
989 | 0 | bool done = false; |
990 | 0 |
991 | 0 | spin_lock_recursive(&d->page_alloc_lock); |
992 | 0 |
993 | 0 | for ( i = 0; ; ) |
994 | 0 | { |
995 | 0 | struct page_info *page = page_list_remove_head(&d->page_list); |
996 | 0 |
997 | 0 | if ( page_lock(page) ) |
998 | 0 | { |
999 | 0 | if ( (page->u.inuse.type_info & PGT_type_mask) == |
1000 | 0 | PGT_l4_page_table ) |
1001 | 0 | done = !fill_ro_mpt(_mfn(page_to_mfn(page))); |
1002 | 0 |
1003 | 0 | page_unlock(page); |
1004 | 0 | } |
1005 | 0 |
1006 | 0 | page_list_add_tail(page, &d->page_list); |
1007 | 0 |
1008 | 0 | if ( done || (!(++i & 0xff) && hypercall_preempt_check()) ) |
1009 | 0 | break; |
1010 | 0 | } |
1011 | 0 |
1012 | 0 | spin_unlock_recursive(&d->page_alloc_lock); |
1013 | 0 |
1014 | 0 | if ( !done ) |
1015 | 0 | return -ERESTART; |
1016 | 0 | } |
1017 | 0 |
1018 | 0 | d->vm_assist = c(vm_assist); |
1019 | 0 | } |
1020 | 0 |
1021 | 0 | rc = put_old_guest_table(current); |
1022 | 0 | if ( rc ) |
1023 | 0 | return rc; |
1024 | 0 |
1025 | 0 | if ( !compat ) |
1026 | 0 | rc = (int)pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents); |
1027 | 0 | else |
1028 | 0 | { |
1029 | 0 | unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)]; |
1030 | 0 | unsigned int n = (c.cmp->gdt_ents + 511) / 512; |
1031 | 0 |
1032 | 0 | if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) ) |
1033 | 0 | return -EINVAL; |
1034 | 0 | for ( i = 0; i < n; ++i ) |
1035 | 0 | gdt_frames[i] = c.cmp->gdt_frames[i]; |
1036 | 0 | rc = (int)pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents); |
1037 | 0 | } |
1038 | 0 | if ( rc != 0 ) |
1039 | 0 | return rc; |
1040 | 0 |
1041 | 0 | set_bit(_VPF_in_reset, &v->pause_flags); |
1042 | 0 |
1043 | 0 | if ( !compat ) |
1044 | 0 | cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]); |
1045 | 0 | else |
1046 | 0 | cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]); |
1047 | 0 | cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); |
1048 | 0 |
1049 | 0 | if ( !cr3_page ) |
1050 | 0 | rc = -EINVAL; |
1051 | 0 | else if ( paging_mode_refcounts(d) ) |
1052 | 0 | /* nothing */; |
1053 | 0 | else if ( cr3_page == v->arch.old_guest_table ) |
1054 | 0 | { |
1055 | 0 | v->arch.old_guest_table = NULL; |
1056 | 0 | put_page(cr3_page); |
1057 | 0 | } |
1058 | 0 | else |
1059 | 0 | { |
1060 | 0 | if ( !compat ) |
1061 | 0 | rc = put_old_guest_table(v); |
1062 | 0 | if ( !rc ) |
1063 | 0 | rc = get_page_type_preemptible(cr3_page, |
1064 | 0 | !compat ? PGT_root_page_table |
1065 | 0 | : PGT_l3_page_table); |
1066 | 0 | switch ( rc ) |
1067 | 0 | { |
1068 | 0 | case -EINTR: |
1069 | 0 | rc = -ERESTART; |
1070 | 0 | case -ERESTART: |
1071 | 0 | break; |
1072 | 0 | case 0: |
1073 | 0 | if ( !compat && !VM_ASSIST(d, m2p_strict) && |
1074 | 0 | !paging_mode_refcounts(d) ) |
1075 | 0 | fill_ro_mpt(_mfn(cr3_gfn)); |
1076 | 0 | break; |
1077 | 0 | default: |
1078 | 0 | if ( cr3_page == current->arch.old_guest_table ) |
1079 | 0 | cr3_page = NULL; |
1080 | 0 | break; |
1081 | 0 | } |
1082 | 0 | } |
1083 | 0 | if ( rc ) |
1084 | 0 | /* handled below */; |
1085 | 0 | else if ( !compat ) |
1086 | 0 | { |
1087 | 0 | v->arch.guest_table = pagetable_from_page(cr3_page); |
1088 | 0 | if ( c.nat->ctrlreg[1] ) |
1089 | 0 | { |
1090 | 0 | cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]); |
1091 | 0 | cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); |
1092 | 0 |
1093 | 0 | if ( !cr3_page ) |
1094 | 0 | rc = -EINVAL; |
1095 | 0 | else if ( !paging_mode_refcounts(d) ) |
1096 | 0 | { |
1097 | 0 | rc = get_page_type_preemptible(cr3_page, PGT_root_page_table); |
1098 | 0 | switch ( rc ) |
1099 | 0 | { |
1100 | 0 | case -EINTR: |
1101 | 0 | rc = -ERESTART; |
1102 | 0 | /* Fallthrough */ |
1103 | 0 | case -ERESTART: |
1104 | 0 | v->arch.old_guest_ptpg = NULL; |
1105 | 0 | v->arch.old_guest_table = |
1106 | 0 | pagetable_get_page(v->arch.guest_table); |
1107 | 0 | v->arch.guest_table = pagetable_null(); |
1108 | 0 | break; |
1109 | 0 | default: |
1110 | 0 | if ( cr3_page == current->arch.old_guest_table ) |
1111 | 0 | cr3_page = NULL; |
1112 | 0 | break; |
1113 | 0 | case 0: |
1114 | 0 | if ( VM_ASSIST(d, m2p_strict) ) |
1115 | 0 | zap_ro_mpt(_mfn(cr3_gfn)); |
1116 | 0 | break; |
1117 | 0 | } |
1118 | 0 | } |
1119 | 0 | if ( !rc ) |
1120 | 0 | v->arch.guest_table_user = pagetable_from_page(cr3_page); |
1121 | 0 | } |
1122 | 0 | } |
1123 | 0 | else |
1124 | 0 | { |
1125 | 0 | l4_pgentry_t *l4tab; |
1126 | 0 |
1127 | 0 | l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table)); |
1128 | 0 | *l4tab = l4e_from_pfn(page_to_mfn(cr3_page), |
1129 | 0 | _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED); |
1130 | 0 | unmap_domain_page(l4tab); |
1131 | 0 | } |
1132 | 0 | if ( rc ) |
1133 | 0 | { |
1134 | 0 | if ( cr3_page ) |
1135 | 0 | put_page(cr3_page); |
1136 | 0 | pv_destroy_gdt(v); |
1137 | 0 | return rc; |
1138 | 0 | } |
1139 | 0 |
1140 | 0 | clear_bit(_VPF_in_reset, &v->pause_flags); |
1141 | 0 |
1142 | 0 | if ( v->vcpu_id == 0 ) |
1143 | 0 | update_domain_wallclock_time(d); |
1144 | 0 |
1145 | 0 | /* Don't redo final setup */ |
1146 | 0 | v->is_initialised = 1; |
1147 | 0 |
1148 | 0 | if ( paging_mode_enabled(d) ) |
1149 | 0 | paging_update_paging_modes(v); |
1150 | 0 |
1151 | 0 | update_cr3(v); |
1152 | 0 |
1153 | 0 | out: |
1154 | 0 | if ( flags & VGCF_online ) |
1155 | 0 | clear_bit(_VPF_down, &v->pause_flags); |
1156 | 0 | else |
1157 | 0 | set_bit(_VPF_down, &v->pause_flags); |
1158 | 0 | return 0; |
1159 | 0 | #undef c |
1160 | 0 | } |
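
The c(fld) macro defined near the top of this function is what lets one body serve both ABIs: a union carries either a native or a compat context pointer, and the macro picks the matching layout per field access. A miniature standalone model of the pattern (these structures are made up):

    #include <stdint.h>
    #include <stdio.h>

    struct nat_ctxt { uint64_t ldt_base; };
    struct cmp_ctxt { uint32_t ldt_base; };

    union ctxt_u { struct nat_ctxt *nat; struct cmp_ctxt *cmp; };

    /* Like c(fld), but with the union and compat flag made explicit. */
    #define FLD(u, compat, fld) ((compat) ? (u).cmp->fld : (u).nat->fld)

    int main(void)
    {
        struct nat_ctxt n = { .ldt_base = 0x1234 };
        union ctxt_u u = { .nat = &n };

        printf("%#llx\n", (unsigned long long)FLD(u, 0, ldt_base)); /* 0x1234 */
        return 0;
    }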
1161 | | |
1162 | | int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) |
1163 | 0 | { |
1164 | 0 | int rc; |
1165 | 0 |
1166 | 0 | if ( is_hvm_vcpu(v) ) |
1167 | 0 | { |
1168 | 0 | struct domain *d = v->domain; |
1169 | 0 | struct vcpu_hvm_context ctxt; |
1170 | 0 |
1171 | 0 | if ( copy_from_guest(&ctxt, arg, 1) ) |
1172 | 0 | return -EFAULT; |
1173 | 0 |
1174 | 0 | domain_lock(d); |
1175 | 0 | rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt); |
1176 | 0 | domain_unlock(d); |
1177 | 0 | } |
1178 | 0 | else |
1179 | 0 | rc = default_initialise_vcpu(v, arg); |
1180 | 0 |
1181 | 0 | return rc; |
1182 | 0 | } |
1183 | | |
1184 | | int arch_vcpu_reset(struct vcpu *v) |
1185 | 0 | { |
1186 | 0 | if ( is_pv_vcpu(v) ) |
1187 | 0 | { |
1188 | 0 | pv_destroy_gdt(v); |
1189 | 0 | return vcpu_destroy_pagetables(v); |
1190 | 0 | } |
1191 | 0 |
1192 | 0 | vcpu_end_shutdown_deferral(v); |
1193 | 0 | return 0; |
1194 | 0 | } |
1195 | | |
1196 | | long |
1197 | | arch_do_vcpu_op( |
1198 | | int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) |
1199 | 0 | { |
1200 | 0 | long rc = 0; |
1201 | 0 |
1202 | 0 | switch ( cmd ) |
1203 | 0 | { |
1204 | 0 | case VCPUOP_register_vcpu_time_memory_area: |
1205 | 0 | { |
1206 | 0 | struct vcpu_register_time_memory_area area; |
1207 | 0 |
1208 | 0 | rc = -EFAULT; |
1209 | 0 | if ( copy_from_guest(&area, arg, 1) ) |
1210 | 0 | break; |
1211 | 0 |
1212 | 0 | if ( !guest_handle_okay(area.addr.h, 1) ) |
1213 | 0 | break; |
1214 | 0 |
1215 | 0 | rc = 0; |
1216 | 0 | v->arch.time_info_guest = area.addr.h; |
1217 | 0 |
1218 | 0 | force_update_vcpu_system_time(v); |
1219 | 0 |
1220 | 0 | break; |
1221 | 0 | } |
1222 | 0 |
1223 | 0 | case VCPUOP_get_physid: |
1224 | 0 | { |
1225 | 0 | struct vcpu_get_physid cpu_id; |
1226 | 0 |
1227 | 0 | rc = -EINVAL; |
1228 | 0 | if ( !is_pinned_vcpu(v) ) |
1229 | 0 | break; |
1230 | 0 |
1231 | 0 | cpu_id.phys_id = |
1232 | 0 | (uint64_t)x86_cpu_to_apicid[v->vcpu_id] | |
1233 | 0 | ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32); |
1234 | 0 |
1235 | 0 | rc = -EFAULT; |
1236 | 0 | if ( copy_to_guest(arg, &cpu_id, 1) ) |
1237 | 0 | break; |
1238 | 0 |
1239 | 0 | rc = 0; |
1240 | 0 | break; |
1241 | 0 | } |
1242 | 0 |
1243 | 0 | default: |
1244 | 0 | rc = -ENOSYS; |
1245 | 0 | break; |
1246 | 0 | } |
1247 | 0 |
1248 | 0 | return rc; |
1249 | 0 | } |
1250 | | |
1251 | | /* |
1252 | | * Loading a nul selector does not clear bases and limits on AMD CPUs. Be on |
1253 | | * the safe side and re-initialize both to flat segment values before loading |
1254 | | * a nul selector. |
1255 | | */ |
1256 | 0 | #define preload_segment(seg, value) do { \ |
1257 | 0 | if ( !((value) & ~3) && \ |
1258 | 0 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) \ |
1259 | 0 | asm volatile ( "movl %k0, %%" #seg \ |
1260 | 0 | :: "r" (FLAT_USER_DS32) ); \ |
1261 | 0 | } while ( false ) |
1262 | | |
1263 | 0 | #define loadsegment(seg,value) ({ \ |
1264 | 0 | int __r = 1; \ |
1265 | 0 | asm volatile ( \ |
1266 | 0 | "1: movl %k1,%%" #seg "\n2:\n" \ |
1267 | 0 | ".section .fixup,\"ax\"\n" \ |
1268 | 0 | "3: xorl %k0,%k0\n" \ |
1269 | 0 | " movl %k0,%%" #seg "\n" \ |
1270 | 0 | " jmp 2b\n" \ |
1271 | 0 | ".previous\n" \ |
1272 | 0 | _ASM_EXTABLE(1b, 3b) \ |
1273 | 0 | : "=r" (__r) : "r" (value), "0" (__r) );\ |
1274 | 0 | __r; }) |
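
loadsegment() leans on the exception-fixup table: if the mov into the selector register faults on a bad guest selector, the _ASM_EXTABLE entry redirects execution to the fixup, which installs a nul selector and leaves 0 in __r, so the caller knows to build a failsafe callback frame. A userspace model of that contract, with the fault simulated by a validity check:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int seg_ds; /* stands in for the DS register */

    static bool selector_valid(unsigned int sel) { return sel != 0xbad; }

    /* Returns true on success; on "fault", falls back to a nul
     * selector and reports failure, as the fixup path does. */
    static bool loadsegment_model(unsigned int sel)
    {
        if ( selector_valid(sel) )
        {
            seg_ds = sel;
            return true;
        }
        seg_ds = 0;
        return false;
    }

    int main(void)
    {
        /* 1 0: the second, invalid selector takes the fixup path. */
        printf("%d %d\n", loadsegment_model(0x2b), loadsegment_model(0xbad));
        return 0;
    }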
1275 | | |
1276 | | /* |
1277 | | * save_segments() writes a mask of segments which are dirty (non-zero), |
1278 | | * allowing load_segments() to avoid some expensive segment loads and |
1279 | | * MSR writes. |
1280 | | */ |
1281 | | static DEFINE_PER_CPU(unsigned int, dirty_segment_mask); |
1282 | 0 | #define DIRTY_DS 0x01 |
1283 | 0 | #define DIRTY_ES 0x02 |
1284 | 0 | #define DIRTY_FS 0x04 |
1285 | 0 | #define DIRTY_GS 0x08 |
1286 | 0 | #define DIRTY_FS_BASE 0x10 |
1287 | 0 | #define DIRTY_GS_BASE_USER 0x20 |
1288 | | |
1289 | | static void load_segments(struct vcpu *n) |
1290 | 0 | { |
1291 | 0 | struct cpu_user_regs *uregs = &n->arch.user_regs; |
1292 | 0 | int all_segs_okay = 1; |
1293 | 0 | unsigned int dirty_segment_mask, cpu = smp_processor_id(); |
1294 | 0 |
1295 | 0 | /* Load and clear the dirty segment mask. */ |
1296 | 0 | dirty_segment_mask = per_cpu(dirty_segment_mask, cpu); |
1297 | 0 | per_cpu(dirty_segment_mask, cpu) = 0; |
1298 | 0 |
1299 | 0 | /* Either selector != 0 ==> reload. */ |
1300 | 0 | if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) ) |
1301 | 0 | { |
1302 | 0 | preload_segment(ds, uregs->ds); |
1303 | 0 | all_segs_okay &= loadsegment(ds, uregs->ds); |
1304 | 0 | } |
1305 | 0 |
1306 | 0 | /* Either selector != 0 ==> reload. */ |
1307 | 0 | if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) ) |
1308 | 0 | { |
1309 | 0 | preload_segment(es, uregs->es); |
1310 | 0 | all_segs_okay &= loadsegment(es, uregs->es); |
1311 | 0 | } |
1312 | 0 |
1313 | 0 | /* Either selector != 0 ==> reload. */ |
1314 | 0 | if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) ) |
1315 | 0 | { |
1316 | 0 | all_segs_okay &= loadsegment(fs, uregs->fs); |
1317 | 0 | /* non-nul selector updates fs_base */ |
1318 | 0 | if ( uregs->fs & ~3 ) |
1319 | 0 | dirty_segment_mask &= ~DIRTY_FS_BASE; |
1320 | 0 | } |
1321 | 0 |
1322 | 0 | /* Either selector != 0 ==> reload. */ |
1323 | 0 | if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) ) |
1324 | 0 | { |
1325 | 0 | all_segs_okay &= loadsegment(gs, uregs->gs); |
1326 | 0 | /* non-nul selector updates gs_base_user */ |
1327 | 0 | if ( uregs->gs & ~3 ) |
1328 | 0 | dirty_segment_mask &= ~DIRTY_GS_BASE_USER; |
1329 | 0 | } |
1330 | 0 |
1331 | 0 | if ( !is_pv_32bit_vcpu(n) ) |
1332 | 0 | { |
1333 | 0 | /* This can only be non-zero if the selector is nul. */ |
1334 | 0 | if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) ) |
1335 | 0 | wrfsbase(n->arch.pv_vcpu.fs_base); |
1336 | 0 |
1337 | 0 | /* Most kernels have non-zero GS base, so don't bother testing. */ |
1338 | 0 | /* (This is also a serialising instruction, avoiding AMD erratum #88.) */ |
1339 | 0 | wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel); |
1340 | 0 |
1341 | 0 | /* This can only be non-zero if the selector is nul. */ |
1342 | 0 | if ( n->arch.pv_vcpu.gs_base_user | |
1343 | 0 | (dirty_segment_mask & DIRTY_GS_BASE_USER) ) |
1344 | 0 | wrgsbase(n->arch.pv_vcpu.gs_base_user); |
1345 | 0 |
1346 | 0 | /* If in kernel mode then switch the GS bases around. */ |
1347 | 0 | if ( (n->arch.flags & TF_kernel_mode) ) |
1348 | 0 | asm volatile ( "swapgs" ); |
1349 | 0 | } |
1350 | 0 |
1351 | 0 | if ( unlikely(!all_segs_okay) ) |
1352 | 0 | { |
1353 | 0 | struct pv_vcpu *pv = &n->arch.pv_vcpu; |
1354 | 0 | struct cpu_user_regs *regs = guest_cpu_user_regs(); |
1355 | 0 | unsigned long *rsp = |
1356 | 0 | (unsigned long *)(((n->arch.flags & TF_kernel_mode) |
1357 | 0 | ? regs->rsp : pv->kernel_sp) & ~0xf); |
1358 | 0 | unsigned long cs_and_mask, rflags; |
1359 | 0 |
1360 | 0 | /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */ |
1361 | 0 | rflags = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL); |
1362 | 0 | rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9; |
1363 | 0 | if ( VM_ASSIST(n->domain, architectural_iopl) ) |
1364 | 0 | rflags |= n->arch.pv_vcpu.iopl; |
1365 | 0 |
1366 | 0 | if ( is_pv_32bit_vcpu(n) ) |
1367 | 0 | { |
1368 | 0 | unsigned int *esp = ring_1(regs) ? |
1369 | 0 | (unsigned int *)regs->rsp : |
1370 | 0 | (unsigned int *)pv->kernel_sp; |
1371 | 0 | int ret = 0; |
1372 | 0 |
1373 | 0 | /* CS longword also contains full evtchn_upcall_mask. */ |
1374 | 0 | cs_and_mask = (unsigned short)regs->cs | |
1375 | 0 | ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16); |
1376 | 0 |
1377 | 0 | if ( !ring_1(regs) ) |
1378 | 0 | { |
1379 | 0 | ret = put_user(regs->ss, esp-1); |
1380 | 0 | ret |= put_user(regs->esp, esp-2); |
1381 | 0 | esp -= 2; |
1382 | 0 | } |
1383 | 0 |
1384 | 0 | if ( ret | |
1385 | 0 | put_user(rflags, esp-1) | |
1386 | 0 | put_user(cs_and_mask, esp-2) | |
1387 | 0 | put_user(regs->eip, esp-3) | |
1388 | 0 | put_user(uregs->gs, esp-4) | |
1389 | 0 | put_user(uregs->fs, esp-5) | |
1390 | 0 | put_user(uregs->es, esp-6) | |
1391 | 0 | put_user(uregs->ds, esp-7) ) |
1392 | 0 | { |
1393 | 0 | gprintk(XENLOG_ERR, |
1394 | 0 | "error while creating compat failsafe callback frame\n"); |
1395 | 0 | domain_crash(n->domain); |
1396 | 0 | } |
1397 | 0 |
1398 | 0 | if ( n->arch.vgc_flags & VGCF_failsafe_disables_events ) |
1399 | 0 | vcpu_info(n, evtchn_upcall_mask) = 1; |
1400 | 0 |
1401 | 0 | regs->entry_vector |= TRAP_syscall; |
1402 | 0 | regs->eflags &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT| |
1403 | 0 | X86_EFLAGS_IOPL|X86_EFLAGS_TF); |
1404 | 0 | regs->ss = FLAT_COMPAT_KERNEL_SS; |
1405 | 0 | regs->esp = (unsigned long)(esp-7); |
1406 | 0 | regs->cs = FLAT_COMPAT_KERNEL_CS; |
1407 | 0 | regs->eip = pv->failsafe_callback_eip; |
1408 | 0 | return; |
1409 | 0 | } |
1410 | 0 |
1411 | 0 | if ( !(n->arch.flags & TF_kernel_mode) ) |
1412 | 0 | toggle_guest_mode(n); |
1413 | 0 | else |
1414 | 0 | regs->cs &= ~3; |
1415 | 0 |
1416 | 0 | /* CS longword also contains full evtchn_upcall_mask. */ |
1417 | 0 | cs_and_mask = (unsigned long)regs->cs | |
1418 | 0 | ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32); |
1419 | 0 |
1420 | 0 | if ( put_user(regs->ss, rsp- 1) | |
1421 | 0 | put_user(regs->rsp, rsp- 2) | |
1422 | 0 | put_user(rflags, rsp- 3) | |
1423 | 0 | put_user(cs_and_mask, rsp- 4) | |
1424 | 0 | put_user(regs->rip, rsp- 5) | |
1425 | 0 | put_user(uregs->gs, rsp- 6) | |
1426 | 0 | put_user(uregs->fs, rsp- 7) | |
1427 | 0 | put_user(uregs->es, rsp- 8) | |
1428 | 0 | put_user(uregs->ds, rsp- 9) | |
1429 | 0 | put_user(regs->r11, rsp-10) | |
1430 | 0 | put_user(regs->rcx, rsp-11) ) |
1431 | 0 | { |
1432 | 0 | gprintk(XENLOG_ERR, |
1433 | 0 | "error while creating failsafe callback frame\n"); |
1434 | 0 | domain_crash(n->domain); |
1435 | 0 | } |
1436 | 0 |
1437 | 0 | if ( n->arch.vgc_flags & VGCF_failsafe_disables_events ) |
1438 | 0 | vcpu_info(n, evtchn_upcall_mask) = 1; |
1439 | 0 |
1440 | 0 | regs->entry_vector |= TRAP_syscall; |
1441 | 0 | regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF| |
1442 | 0 | X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF); |
1443 | 0 | regs->ss = FLAT_KERNEL_SS; |
1444 | 0 | regs->rsp = (unsigned long)(rsp-11); |
1445 | 0 | regs->cs = FLAT_KERNEL_CS; |
1446 | 0 | regs->rip = pv->failsafe_callback_eip; |
1447 | 0 | } |
1448 | 0 | } |
1449 | | |
1450 | | static void save_segments(struct vcpu *v) |
1451 | 0 | { |
1452 | 0 | struct cpu_user_regs *regs = &v->arch.user_regs; |
1453 | 0 | unsigned int dirty_segment_mask = 0; |
1454 | 0 |
1455 | 0 | regs->ds = read_sreg(ds); |
1456 | 0 | regs->es = read_sreg(es); |
1457 | 0 | regs->fs = read_sreg(fs); |
1458 | 0 | regs->gs = read_sreg(gs); |
1459 | 0 |
1460 | 0 | if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) ) |
1461 | 0 | { |
1462 | 0 | v->arch.pv_vcpu.fs_base = __rdfsbase(); |
1463 | 0 | if ( v->arch.flags & TF_kernel_mode ) |
1464 | 0 | v->arch.pv_vcpu.gs_base_kernel = __rdgsbase(); |
1465 | 0 | else |
1466 | 0 | v->arch.pv_vcpu.gs_base_user = __rdgsbase(); |
1467 | 0 | } |
1468 | 0 |
1469 | 0 | if ( regs->ds ) |
1470 | 0 | dirty_segment_mask |= DIRTY_DS; |
1471 | 0 |
1472 | 0 | if ( regs->es ) |
1473 | 0 | dirty_segment_mask |= DIRTY_ES; |
1474 | 0 |
1475 | 0 | if ( regs->fs || is_pv_32bit_vcpu(v) ) |
1476 | 0 | { |
1477 | 0 | dirty_segment_mask |= DIRTY_FS; |
1478 | 0 | /* non-nul selector kills fs_base */ |
1479 | 0 | if ( regs->fs & ~3 ) |
1480 | 0 | v->arch.pv_vcpu.fs_base = 0; |
1481 | 0 | } |
1482 | 0 | if ( v->arch.pv_vcpu.fs_base ) |
1483 | 0 | dirty_segment_mask |= DIRTY_FS_BASE; |
1484 | 0 |
1485 | 0 | if ( regs->gs || is_pv_32bit_vcpu(v) ) |
1486 | 0 | { |
1487 | 0 | dirty_segment_mask |= DIRTY_GS; |
1488 | 0 | /* non-nul selector kills gs_base_user */ |
1489 | 0 | if ( regs->gs & ~3 ) |
1490 | 0 | v->arch.pv_vcpu.gs_base_user = 0; |
1491 | 0 | } |
1492 | 0 | if ( v->arch.pv_vcpu.gs_base_user ) |
1493 | 0 | dirty_segment_mask |= DIRTY_GS_BASE_USER; |
1494 | 0 |
1495 | 0 | this_cpu(dirty_segment_mask) = dirty_segment_mask; |
1496 | 0 | } |
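
Pairing the two halves: save_segments() records which selectors and bases were live, and load_segments() reloads a segment only when its dirty bit is set or the incoming selector is non-zero. A compact model of that decision (two segments only; flag values copied from the defines above):

    #include <stdio.h>

    #define DIRTY_DS 0x01
    #define DIRTY_ES 0x02

    /* What save_segments() computes for two of the segments... */
    static unsigned int mark_dirty(unsigned int ds, unsigned int es)
    {
        return (ds ? DIRTY_DS : 0) | (es ? DIRTY_ES : 0);
    }

    /* ...and the per-segment reload test load_segments() applies. */
    static int need_reload(unsigned int dirty, unsigned int bit,
                           unsigned int incoming_sel)
    {
        return (dirty & bit) | incoming_sel;
    }

    int main(void)
    {
        unsigned int dirty = mark_dirty(0x2b, 0);

        /* 1 0: DS was live and must be reloaded; ES can be skipped. */
        printf("%d %d\n", !!need_reload(dirty, DIRTY_DS, 0),
               !!need_reload(dirty, DIRTY_ES, 0));
        return 0;
    }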
1497 | | |
1498 | | void paravirt_ctxt_switch_from(struct vcpu *v) |
1499 | 0 | { |
1500 | 0 | save_segments(v); |
1501 | 0 |
1502 | 0 | /* |
1503 | 0 | * Disable debug breakpoints. We do this aggressively because if we switch |
1504 | 0 | * to an HVM guest we may load DR0-DR3 with values that can cause #DB |
1505 | 0 | * inside Xen, before we get a chance to reload DR7, and this cannot always |
1506 | 0 | * safely be handled. |
1507 | 0 | */ |
1508 | 0 | if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) |
1509 | 0 | write_debugreg(7, 0); |
1510 | 0 | } |
1511 | | |
1512 | | void paravirt_ctxt_switch_to(struct vcpu *v) |
1513 | 0 | { |
1514 | 0 | unsigned long cr4; |
1515 | 0 |
1516 | 0 | cr4 = pv_guest_cr4_to_real_cr4(v); |
1517 | 0 | if ( unlikely(cr4 != read_cr4()) ) |
1518 | 0 | write_cr4(cr4); |
1519 | 0 |
1520 | 0 | if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) |
1521 | 0 | activate_debugregs(v); |
1522 | 0 |
1523 | 0 | if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) && |
1524 | 0 | boot_cpu_has(X86_FEATURE_RDTSCP) ) |
1525 | 0 | write_rdtscp_aux(v->domain->arch.incarnation); |
1526 | 0 | } |
1527 | | |
1528 | | /* Update per-VCPU guest runstate shared memory area (if registered). */ |
1529 | | bool update_runstate_area(struct vcpu *v) |
1530 | 328k | { |
1531 | 328k | bool rc; |
1532 | 328k | struct guest_memory_policy policy = |
1533 | 328k | { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false }; |
1534 | 328k | void __user *guest_handle = NULL; |
1535 | 328k | |
1536 | 328k | if ( guest_handle_is_null(runstate_guest(v)) ) |
1537 | 328k | return true; |
1538 | 328k | |
1539 | 127 | update_guest_memory_policy(v, &policy); |
1540 | 127 | |
1541 | 127 | if ( VM_ASSIST(v->domain, runstate_update_flag) ) |
1542 | 0 | { |
1543 | 0 | guest_handle = has_32bit_shinfo(v->domain) |
1544 | 0 | ? &v->runstate_guest.compat.p->state_entry_time + 1 |
1545 | 0 | : &v->runstate_guest.native.p->state_entry_time + 1; |
1546 | 0 | guest_handle--; |
1547 | 0 | v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE; |
1548 | 0 | __raw_copy_to_guest(guest_handle, |
1549 | 0 | (void *)(&v->runstate.state_entry_time + 1) - 1, 1); |
1550 | 0 | smp_wmb(); |
1551 | 0 | } |
1552 | 127 | |
1553 | 127 | if ( has_32bit_shinfo(v->domain) ) |
1554 | 0 | { |
1555 | 0 | struct compat_vcpu_runstate_info info; |
1556 | 0 |
1557 | 0 | XLAT_vcpu_runstate_info(&info, &v->runstate); |
1558 | 0 | __copy_to_guest(v->runstate_guest.compat, &info, 1); |
1559 | 0 | rc = true; |
1560 | 0 | } |
1561 | 127 | else |
1562 | 127 | rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) != |
1563 | 127 | sizeof(v->runstate); |
1564 | 127 | |
1565 | 127 | if ( guest_handle ) |
1566 | 0 | { |
1567 | 0 | v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE; |
1568 | 0 | smp_wmb(); |
1569 | 0 | __raw_copy_to_guest(guest_handle, |
1570 | 0 | (void *)(&v->runstate.state_entry_time + 1) - 1, 1); |
1571 | 0 | } |
1572 | 127 | |
1573 | 127 | update_guest_memory_policy(v, &policy); |
1574 | 127 | |
1575 | 127 | return rc; |
1576 | 328k | } |
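
The XEN_RUNSTATE_UPDATE handshake above is a single-writer seqlock: the flag is raised in the guest-visible state_entry_time, the payload is copied, then the flag is cleared. A hypothetical guest-side reader matching that protocol (flag value as in the public headers; memory barriers elided for brevity, though a real consumer needs them):

    #include <stdint.h>
    #include <stdio.h>

    #define XEN_RUNSTATE_UPDATE (1ULL << 63)

    struct runstate {
        volatile uint64_t state_entry_time;
        uint64_t time[4];
    };

    /* Retry until one pass sees the UPDATE flag clear and the same
     * entry time on both sides of the copy, i.e. no writer ran. */
    static void read_runstate(const struct runstate *rs, uint64_t out[4])
    {
        uint64_t t1, t2;
        unsigned int i;

        do {
            t1 = rs->state_entry_time;
            for ( i = 0; i < 4; i++ )
                out[i] = rs->time[i];
            t2 = rs->state_entry_time;
        } while ( (t1 & XEN_RUNSTATE_UPDATE) || t1 != t2 );
    }

    int main(void)
    {
        struct runstate rs = { 42, { 1, 2, 3, 4 } };
        uint64_t t[4];

        read_runstate(&rs, t);
        printf("%llu\n", (unsigned long long)t[0]); /* 1 */
        return 0;
    }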
1577 | | |
1578 | | static void _update_runstate_area(struct vcpu *v) |
1579 | 328k | { |
1580 | 328k | if ( !update_runstate_area(v) && is_pv_vcpu(v) && |
1581 | 0 | !(v->arch.flags & TF_kernel_mode) ) |
1582 | 0 | v->arch.pv_vcpu.need_update_runstate_area = 1; |
1583 | 328k | } |
1584 | | |
1585 | | static inline bool need_full_gdt(const struct domain *d) |
1586 | 118k | { |
1587 | 118k | return is_pv_domain(d) && !is_idle_domain(d); |
1588 | 118k | } |
1589 | | |
1590 | | static void __context_switch(void) |
1591 | 39.5k | { |
1592 | 39.5k | struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); |
1593 | 39.5k | unsigned int cpu = smp_processor_id(); |
1594 | 39.5k | struct vcpu *p = per_cpu(curr_vcpu, cpu); |
1595 | 39.5k | struct vcpu *n = current; |
1596 | 39.5k | struct domain *pd = p->domain, *nd = n->domain; |
1597 | 39.5k | struct desc_struct *gdt; |
1598 | 39.5k | struct desc_ptr gdt_desc; |
1599 | 39.5k | |
1600 | 39.5k | ASSERT(p != n); |
1601 | 39.5k | ASSERT(cpumask_empty(n->vcpu_dirty_cpumask)); |
1602 | 39.5k | |
1603 | 39.5k | if ( !is_idle_domain(pd) ) |
1604 | 36.9k | { |
1605 | 36.9k | memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES); |
1606 | 36.9k | vcpu_save_fpu(p); |
1607 | 36.9k | pd->arch.ctxt_switch->from(p); |
1608 | 36.9k | } |
1609 | 39.5k | |
1610 | 39.5k | /* |
1611 | 39.5k | * Mark this CPU in next domain's dirty cpumasks before calling |
1612 | 39.5k | * ctxt_switch_to(). This avoids a race on things like EPT flushing, |
1613 | 39.5k | * which is synchronised on that function. |
1614 | 39.5k | */ |
1615 | 39.5k | if ( pd != nd ) |
1616 | 5.17k | cpumask_set_cpu(cpu, nd->domain_dirty_cpumask); |
1617 | 39.5k | cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask); |
1618 | 39.5k | |
1619 | 39.5k | if ( !is_idle_domain(nd) ) |
1620 | 37.0k | { |
1621 | 37.0k | memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES); |
1622 | 37.0k | if ( cpu_has_xsave ) |
1623 | 37.0k | { |
1624 | 33.3k | u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; |
1625 | 37.0k | |
1626 | 37.0k | if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) |
1627 | 0 | BUG(); |
1628 | 37.0k | |
1629 | 37.0k | if ( cpu_has_xsaves && is_hvm_vcpu(n) ) |
1630 | 0 | set_msr_xss(n->arch.hvm_vcpu.msr_xss); |
1631 | 37.0k | } |
1632 | 37.0k | vcpu_restore_fpu_eager(n); |
1633 | 37.0k | nd->arch.ctxt_switch->to(n); |
1634 | 37.0k | } |
1635 | 39.5k | |
1636 | 39.5k | psr_ctxt_switch_to(nd); |
1637 | 39.5k | |
1638 | 39.5k | gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) : |
1639 | 18.4E | per_cpu(compat_gdt_table, cpu); |
1640 | 39.5k | if ( need_full_gdt(nd) ) |
1641 | 0 | { |
1642 | 0 | unsigned long mfn = virt_to_mfn(gdt); |
1643 | 0 | l1_pgentry_t *pl1e = pv_gdt_ptes(n); |
1644 | 0 | unsigned int i; |
1645 | 0 |
1646 | 0 | for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ ) |
1647 | 0 | l1e_write(pl1e + FIRST_RESERVED_GDT_PAGE + i, |
1648 | 0 | l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR_RW)); |
1649 | 0 | } |
1650 | 39.5k | |
1651 | 39.5k | if ( need_full_gdt(pd) && |
1652 | 0 | ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) ) |
1653 | 0 | { |
1654 | 0 | gdt_desc.limit = LAST_RESERVED_GDT_BYTE; |
1655 | 0 | gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY); |
1656 | 0 | asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); |
1657 | 0 | } |
1658 | 39.5k | |
1659 | 39.5k | write_ptbase(n); |
1660 | 39.5k | |
1661 | 39.5k | if ( need_full_gdt(nd) && |
1662 | 0 | ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) ) |
1663 | 0 | { |
1664 | 0 | gdt_desc.limit = LAST_RESERVED_GDT_BYTE; |
1665 | 0 | gdt_desc.base = GDT_VIRT_START(n); |
1666 | 0 | asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); |
1667 | 0 | } |
1668 | 39.5k | |
1669 | 39.5k | if ( pd != nd ) |
1670 | 5.17k | cpumask_clear_cpu(cpu, pd->domain_dirty_cpumask); |
1671 | 39.5k | cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask); |
1672 | 39.5k | |
1673 | 39.5k | per_cpu(curr_vcpu, cpu) = n; |
1674 | 39.5k | } |
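
Both lgdt sequences above load a 10-byte descriptor-table pointer: a 16-bit limit followed by a 64-bit linear base, with the base offset (gdt - FIRST_RESERVED_GDT_ENTRY, or GDT_VIRT_START(n)) chosen so the reserved selectors index the pages just mapped. A self-contained sketch of that operand layout; the names are illustrative, not Xen's struct desc_ptr:

#include <stdint.h>

struct desc_ptr_example {
    uint16_t limit;                 /* table size in bytes, minus one */
    uint64_t base;                  /* linear address of descriptor 0 */
} __attribute__((packed));

static inline void load_gdt(const struct desc_ptr_example *p)
{
    __asm__ volatile ( "lgdt %0" : : "m" (*p) );
}
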
1675 | | |
1676 | | |
1677 | | void context_switch(struct vcpu *prev, struct vcpu *next) |
1678 | 164k | { |
1679 | 164k | unsigned int cpu = smp_processor_id(); |
1680 | 164k | const struct domain *prevd = prev->domain, *nextd = next->domain; |
1681 | 164k | cpumask_t dirty_mask; |
1682 | 164k | |
1683 | 164k | ASSERT(local_irq_is_enabled()); |
1684 | 164k | |
1685 | 164k | cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); |
1686 | 164k | /* Allow at most one CPU at a time to be dirty. */ |
1687 | 164k | ASSERT(cpumask_weight(&dirty_mask) <= 1); |
1688 | 164k | if ( unlikely(!cpumask_test_cpu(cpu, &dirty_mask) && |
1689 | 164k | !cpumask_empty(&dirty_mask)) ) |
1690 | 30 | { |
1691 | 30 | /* Other cpus call __sync_local_execstate from flush ipi handler. */ |
1692 | 30 | flush_tlb_mask(&dirty_mask); |
1693 | 30 | } |
1694 | 164k | |
1695 | 164k | if ( prev != next ) |
1696 | 164k | { |
1697 | 164k | _update_runstate_area(prev); |
1698 | 164k | vpmu_switch_from(prev); |
1699 | 164k | np2m_schedule(NP2M_SCHEDLE_OUT); |
1700 | 164k | } |
1701 | 164k | |
1702 | 164k | if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm_vcpu.tm_list) ) |
1703 | 0 | pt_save_timer(prev); |
1704 | 164k | |
1705 | 164k | local_irq_disable(); |
1706 | 164k | |
1707 | 164k | set_current(next); |
1708 | 164k | |
1709 | 164k | if ( (per_cpu(curr_vcpu, cpu) == next) || |
1710 | 102k | (is_idle_domain(nextd) && cpu_online(cpu)) ) |
1711 | 127k | { |
1712 | 127k | local_irq_enable(); |
1713 | 127k | } |
1714 | 164k | else |
1715 | 37.0k | { |
1716 | 37.0k | __context_switch(); |
1717 | 37.0k | |
1718 | 37.0k | if ( is_pv_domain(nextd) && |
1719 | 0 | (is_idle_domain(prevd) || |
1720 | 0 | is_hvm_domain(prevd) || |
1721 | 0 | is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) ) |
1722 | 0 | { |
1723 | 0 | uint64_t efer = read_efer(); |
1724 | 0 | if ( !(efer & EFER_SCE) ) |
1725 | 0 | write_efer(efer | EFER_SCE); |
1726 | 0 | } |
1727 | 37.0k | |
1728 | 37.0k | /* Re-enable interrupts before restoring state which may fault. */ |
1729 | 37.0k | local_irq_enable(); |
1730 | 37.0k | |
1731 | 37.0k | if ( is_pv_domain(nextd) ) |
1732 | 0 | { |
1733 | 0 | load_LDT(next); |
1734 | 0 | load_segments(next); |
1735 | 0 | } |
1736 | 37.0k | |
1737 | 37.0k | ctxt_switch_levelling(next); |
1738 | 37.0k | } |
1739 | 164k | |
1740 | 164k | context_saved(prev); |
1741 | 164k | |
1742 | 164k | if ( prev != next ) |
1743 | 165k | { |
1744 | 165k | _update_runstate_area(next); |
1745 | 165k | |
1746 | 165k | /* Must be done with interrupts enabled */ |
1747 | 165k | vpmu_switch_to(next); |
1748 | 165k | np2m_schedule(NP2M_SCHEDLE_IN); |
1749 | 165k | } |
1750 | 164k | |
1751 | 164k | /* Ensure that the vcpu has an up-to-date time base. */ |
1752 | 164k | update_vcpu_system_time(next); |
1753 | 164k | |
1754 | 164k | /* |
1755 | 164k | * Schedule tail *should* be a terminal function pointer, but leave a |
1756 | 164k | * bug frame around just in case it returns, to save going back into the |
1757 | 164k | * context switching code and leaving a far more subtle crash to diagnose. |
1758 | 164k | */ |
1759 | 164k | nextd->arch.ctxt_switch->tail(next); |
1760 | 164k | BUG(); |
1761 | 164k | } |
1762 | | |
1763 | | void continue_running(struct vcpu *same) |
1764 | 4.42M | { |
1765 | 4.42M | /* See the comment above. */ |
1766 | 4.42M | same->domain->arch.ctxt_switch->tail(same); |
1767 | 4.42M | BUG(); |
1768 | 4.42M | } |
1769 | | |
1770 | | int __sync_local_execstate(void) |
1771 | 27.1M | { |
1772 | 27.1M | unsigned long flags; |
1773 | 27.1M | int switch_required; |
1774 | 27.1M | |
1775 | 27.1M | local_irq_save(flags); |
1776 | 27.1M | |
1777 | 27.1M | switch_required = (this_cpu(curr_vcpu) != current); |
1778 | 27.1M | |
1779 | 27.1M | if ( switch_required ) |
1780 | 2.56k | { |
1781 | 2.56k | ASSERT(current == idle_vcpu[smp_processor_id()]); |
1782 | 2.56k | __context_switch(); |
1783 | 2.56k | } |
1784 | 27.1M | |
1785 | 27.1M | local_irq_restore(flags); |
1786 | 27.1M | |
1787 | 27.1M | return switch_required; |
1788 | 27.1M | } |
1789 | | |
1790 | | void sync_local_execstate(void) |
1791 | 27.1M | { |
1792 | 27.1M | (void)__sync_local_execstate(); |
1793 | 27.1M | } |
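
context_switch() and __sync_local_execstate() together implement lazy context switching: a switch to the idle vCPU leaves the outgoing guest's register state loaded and per_cpu(curr_vcpu) unchanged, and that state is only saved later, on demand. A simplified model of the two-variable invariant, with assumed types rather than Xen's:

#include <stdbool.h>
#include <stddef.h>

struct vcpu { bool is_idle; };

static struct vcpu *curr_vcpu_hw;    /* vCPU whose state the CPU holds */
static struct vcpu *current_sched;   /* vCPU the scheduler picked */

/* Stand-in for __context_switch(): save old state, load next's. */
static void full_switch(struct vcpu *next)
{
    curr_vcpu_hw = next;
}

/* Models the branch in context_switch(): going idle is free. */
static void model_context_switch(struct vcpu *next)
{
    current_sched = next;
    if ( curr_vcpu_hw == next || next->is_idle )
        return;                  /* lazy: leave the old state loaded */
    full_switch(next);           /* eager: really save and restore */
}

/* Models sync_local_execstate(): flush lazily-kept state on demand. */
static void model_sync_local_execstate(void)
{
    if ( curr_vcpu_hw != current_sched )
        full_switch(current_sched);
}

The hardware state is stale exactly when the two views disagree, which is the condition __sync_local_execstate() tests above.
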
1794 | | |
1795 | | void sync_vcpu_execstate(struct vcpu *v) |
1796 | 315 | { |
1797 | 315 | if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) ) |
1798 | 0 | sync_local_execstate(); |
1799 | 315 | |
1800 | 315 | /* Other cpus call __sync_local_execstate from flush ipi handler. */ |
1801 | 315 | flush_tlb_mask(v->vcpu_dirty_cpumask); |
1802 | 315 | } |
1803 | | |
1804 | | static int relinquish_memory( |
1805 | | struct domain *d, struct page_list_head *list, unsigned long type) |
1806 | 0 | { |
1807 | 0 | struct page_info *page; |
1808 | 0 | unsigned long x, y; |
1809 | 0 | int ret = 0; |
1810 | 0 |
1811 | 0 | /* Use a recursive lock, as we may enter 'free_domheap_page'. */ |
1812 | 0 | spin_lock_recursive(&d->page_alloc_lock); |
1813 | 0 |
1814 | 0 | while ( (page = page_list_remove_head(list)) ) |
1815 | 0 | { |
1816 | 0 | /* Grab a reference to the page so it won't disappear from under us. */ |
1817 | 0 | if ( unlikely(!get_page(page, d)) ) |
1818 | 0 | { |
1819 | 0 | /* Couldn't get a reference -- someone is freeing this page. */ |
1820 | 0 | page_list_add_tail(page, &d->arch.relmem_list); |
1821 | 0 | continue; |
1822 | 0 | } |
1823 | 0 |
|
1824 | 0 | if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) |
1825 | 0 | ret = put_page_and_type_preemptible(page); |
1826 | 0 | switch ( ret ) |
1827 | 0 | { |
1828 | 0 | case 0: |
1829 | 0 | break; |
1830 | 0 | case -ERESTART: |
1831 | 0 | case -EINTR: |
1832 | 0 | ret = -ERESTART; |
1833 | 0 | page_list_add(page, list); |
1834 | 0 | set_bit(_PGT_pinned, &page->u.inuse.type_info); |
1835 | 0 | put_page(page); |
1836 | 0 | goto out; |
1837 | 0 | default: |
1838 | 0 | BUG(); |
1839 | 0 | } |
1840 | 0 |
1841 | 0 | if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) |
1842 | 0 | put_page(page); |
1843 | 0 |
1844 | 0 | /* |
1845 | 0 | * Forcibly invalidate top-most, still valid page tables at this point |
1846 | 0 | * to break circular 'linear page table' references as well as clean up |
1847 | 0 | * partially validated pages. This is okay because MMU structures are |
1848 | 0 | * not shared across domains and this domain is now dead. Thus top-most |
1849 | 0 | * valid tables are not in use so a non-zero count means circular |
1850 | 0 | * reference or partially validated. |
1851 | 0 | */ |
1852 | 0 | y = page->u.inuse.type_info; |
1853 | 0 | for ( ; ; ) |
1854 | 0 | { |
1855 | 0 | x = y; |
1856 | 0 | if ( likely((x & PGT_type_mask) != type) || |
1857 | 0 | likely(!(x & (PGT_validated|PGT_partial))) ) |
1858 | 0 | break; |
1859 | 0 |
1860 | 0 | y = cmpxchg(&page->u.inuse.type_info, x, |
1861 | 0 | x & ~(PGT_validated|PGT_partial)); |
1862 | 0 | if ( likely(y == x) ) |
1863 | 0 | { |
1864 | 0 | /* No need for atomic update of type_info here: no one else updates it. */
1865 | 0 | switch ( ret = free_page_type(page, x, 1) ) |
1866 | 0 | { |
1867 | 0 | case 0: |
1868 | 0 | break; |
1869 | 0 | case -EINTR: |
1870 | 0 | page_list_add(page, list); |
1871 | 0 | page->u.inuse.type_info |= PGT_validated; |
1872 | 0 | if ( x & PGT_partial ) |
1873 | 0 | put_page(page); |
1874 | 0 | put_page(page); |
1875 | 0 | ret = -ERESTART; |
1876 | 0 | goto out; |
1877 | 0 | case -ERESTART: |
1878 | 0 | page_list_add(page, list); |
1879 | 0 | page->u.inuse.type_info |= PGT_partial; |
1880 | 0 | if ( x & PGT_partial ) |
1881 | 0 | put_page(page); |
1882 | 0 | goto out; |
1883 | 0 | default: |
1884 | 0 | BUG(); |
1885 | 0 | } |
1886 | 0 | if ( x & PGT_partial ) |
1887 | 0 | { |
1888 | 0 | page->u.inuse.type_info--; |
1889 | 0 | put_page(page); |
1890 | 0 | } |
1891 | 0 | break; |
1892 | 0 | } |
1893 | 0 | } |
1894 | 0 |
1895 | 0 | /* Put the page on the list and /then/ potentially free it. */ |
1896 | 0 | page_list_add_tail(page, &d->arch.relmem_list); |
1897 | 0 | put_page(page); |
1898 | 0 |
1899 | 0 | if ( hypercall_preempt_check() ) |
1900 | 0 | { |
1901 | 0 | ret = -ERESTART; |
1902 | 0 | goto out; |
1903 | 0 | } |
1904 | 0 | } |
1905 | 0 |
1906 | 0 | /* list is empty at this point. */ |
1907 | 0 | page_list_move(list, &d->arch.relmem_list); |
1908 | 0 |
1909 | 0 | out: |
1910 | 0 | spin_unlock_recursive(&d->page_alloc_lock); |
1911 | 0 | return ret; |
1912 | 0 | } |
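
relinquish_memory() shows the standard preemptible-teardown pattern: when hypercall_preempt_check() fires, the current page goes back on the list and -ERESTART propagates out, so the restarted hypercall resumes with the list itself as the cursor. A stripped-down sketch of the same loop shape, with stand-in helpers and an illustrative ERESTART value:

#include <stddef.h>

#define ERESTART 85                 /* illustrative; Xen defines its own */

struct item { struct item *next; };

/* Stand-ins for hypercall_preempt_check() and the real teardown work. */
static int preempt_pending(void) { return 0; }
static void release(struct item *it) { (void)it; }

static int relinquish_list(struct item **head)
{
    struct item *it;

    while ( (it = *head) != NULL )
    {
        *head = it->next;           /* detach before working on it */
        release(it);
        if ( preempt_pending() )
            return -ERESTART;       /* caller re-invokes; *head is the cursor */
    }
    return 0;
}
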
1913 | | |
1914 | | int domain_relinquish_resources(struct domain *d) |
1915 | 0 | { |
1916 | 0 | int ret; |
1917 | 0 | struct vcpu *v; |
1918 | 0 |
1919 | 0 | BUG_ON(!cpumask_empty(d->domain_dirty_cpumask)); |
1920 | 0 |
1921 | 0 | switch ( d->arch.relmem ) |
1922 | 0 | { |
1923 | 0 | case RELMEM_not_started: |
1924 | 0 | ret = pci_release_devices(d); |
1925 | 0 | if ( ret ) |
1926 | 0 | return ret; |
1927 | 0 |
1928 | 0 | /* Tear down paging-assistance stuff. */ |
1929 | 0 | ret = paging_teardown(d); |
1930 | 0 | if ( ret ) |
1931 | 0 | return ret; |
1932 | 0 |
1933 | 0 | /* Drop the in-use references to page-table bases. */ |
1934 | 0 | for_each_vcpu ( d, v ) |
1935 | 0 | { |
1936 | 0 | ret = vcpu_destroy_pagetables(v); |
1937 | 0 | if ( ret ) |
1938 | 0 | return ret; |
1939 | 0 | } |
1940 | 0 |
1941 | 0 | if ( is_pv_domain(d) ) |
1942 | 0 | { |
1943 | 0 | for_each_vcpu ( d, v ) |
1944 | 0 | { |
1945 | 0 | /* |
1946 | 0 | * Relinquish GDT mappings. No need for explicit unmapping of |
1947 | 0 | * the LDT as it automatically gets squashed with the guest |
1948 | 0 | * mappings. |
1949 | 0 | */ |
1950 | 0 | pv_destroy_gdt(v); |
1951 | 0 | } |
1952 | 0 | } |
1953 | 0 |
1954 | 0 | if ( d->arch.pirq_eoi_map != NULL ) |
1955 | 0 | { |
1956 | 0 | unmap_domain_page_global(d->arch.pirq_eoi_map); |
1957 | 0 | put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn)); |
1958 | 0 | d->arch.pirq_eoi_map = NULL; |
1959 | 0 | d->arch.auto_unmask = 0; |
1960 | 0 | } |
1961 | 0 |
1962 | 0 | d->arch.relmem = RELMEM_shared; |
1963 | 0 | /* fallthrough */ |
1964 | 0 |
1965 | 0 | case RELMEM_shared: |
1966 | 0 |
1967 | 0 | if ( is_hvm_domain(d) ) |
1968 | 0 | { |
1969 | 0 | /* If the domain has shared pages, relinquish them allowing |
1970 | 0 | * for preemption. */ |
1971 | 0 | ret = relinquish_shared_pages(d); |
1972 | 0 | if ( ret ) |
1973 | 0 | return ret; |
1974 | 0 | } |
1975 | 0 |
1976 | 0 | d->arch.relmem = RELMEM_xen; |
1977 | 0 |
1978 | 0 | spin_lock(&d->page_alloc_lock); |
1979 | 0 | page_list_splice(&d->arch.relmem_list, &d->page_list); |
1980 | 0 | INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); |
1981 | 0 | spin_unlock(&d->page_alloc_lock); |
1982 | 0 |
1983 | 0 | /* Fallthrough. Relinquish every page of memory. */ |
1984 | 0 | case RELMEM_xen: |
1985 | 0 | ret = relinquish_memory(d, &d->xenpage_list, ~0UL); |
1986 | 0 | if ( ret ) |
1987 | 0 | return ret; |
1988 | 0 | d->arch.relmem = RELMEM_l4; |
1989 | 0 | /* fallthrough */ |
1990 | 0 |
1991 | 0 | case RELMEM_l4: |
1992 | 0 | ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table); |
1993 | 0 | if ( ret ) |
1994 | 0 | return ret; |
1995 | 0 | d->arch.relmem = RELMEM_l3; |
1996 | 0 | /* fallthrough */ |
1997 | 0 |
1998 | 0 | case RELMEM_l3: |
1999 | 0 | ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table); |
2000 | 0 | if ( ret ) |
2001 | 0 | return ret; |
2002 | 0 | d->arch.relmem = RELMEM_l2; |
2003 | 0 | /* fallthrough */ |
2004 | 0 |
2005 | 0 | case RELMEM_l2: |
2006 | 0 | ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table); |
2007 | 0 | if ( ret ) |
2008 | 0 | return ret; |
2009 | 0 | d->arch.relmem = RELMEM_done; |
2010 | 0 | /* fallthrough */ |
2011 | 0 |
2012 | 0 | case RELMEM_done: |
2013 | 0 | break; |
2014 | 0 |
2015 | 0 | default: |
2016 | 0 | BUG(); |
2017 | 0 | } |
2018 | 0 |
2019 | 0 | pit_deinit(d); |
2020 | 0 |
2021 | 0 | if ( is_hvm_domain(d) ) |
2022 | 0 | hvm_domain_relinquish_resources(d); |
2023 | 0 |
2024 | 0 | return 0; |
2025 | 0 | } |
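
domain_relinquish_resources() layers a restartable state machine on top of the same idea: the phase reached so far is recorded in d->arch.relmem, and each case falls through into the next, so a restarted call drops straight into the phase that was interrupted. A compact model with hypothetical phase names and a stubbed worker, not Xen's types:

#define ERESTART 85                  /* illustrative, as in the sketch above */

enum phase { PH_START, PH_SHARED, PH_PAGES, PH_DONE };

struct dom { enum phase phase; };

static int do_work(struct dom *d, enum phase p);   /* 0 or -ERESTART */

static int relinquish(struct dom *d)
{
    int ret;

    switch ( d->phase )
    {
    case PH_START:
        if ( (ret = do_work(d, PH_START)) != 0 )
            return ret;              /* preempted: resume here next call */
        d->phase = PH_SHARED;
        /* fallthrough */
    case PH_SHARED:
        if ( (ret = do_work(d, PH_SHARED)) != 0 )
            return ret;
        d->phase = PH_PAGES;
        /* fallthrough */
    case PH_PAGES:
        if ( (ret = do_work(d, PH_PAGES)) != 0 )
            return ret;
        d->phase = PH_DONE;
        /* fallthrough */
    case PH_DONE:
        break;
    }
    return 0;
}

static int do_work(struct dom *d, enum phase p)
{
    (void)d; (void)p;
    return 0;                        /* stand-in for the real teardown */
}
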
2026 | | |
2027 | | void arch_dump_domain_info(struct domain *d) |
2028 | 0 | { |
2029 | 0 | paging_dump_domain_info(d); |
2030 | 0 | } |
2031 | | |
2032 | | void arch_dump_vcpu_info(struct vcpu *v) |
2033 | 0 | { |
2034 | 0 | paging_dump_vcpu_info(v); |
2035 | 0 |
2036 | 0 | vpmu_dump(v); |
2037 | 0 | } |
2038 | | |
2039 | | void vcpu_kick(struct vcpu *v) |
2040 | 99.6k | { |
2041 | 99.6k | /* |
2042 | 99.6k | * NB1. 'pause_flags' and 'processor' must be checked /after/ update of |
2043 | 99.6k | * pending flag. These values may fluctuate (after all, we hold no |
2044 | 99.6k | * locks) but the key insight is that each change will cause |
2045 | 99.6k | * evtchn_upcall_pending to be polled. |
2046 | 99.6k | * |
2047 | 99.6k | * NB2. We save the running flag across the unblock to avoid a needless |
2048 | 99.6k | * IPI for domains that we IPI'd to unblock. |
2049 | 99.6k | */ |
2050 | 99.6k | bool running = v->is_running; |
2051 | 99.6k | |
2052 | 99.6k | vcpu_unblock(v); |
2053 | 99.6k | if ( running && (in_irq() || (v != current)) ) |
2054 | 34.4k | cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ); |
2055 | 99.6k | } |
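
The NB1/NB2 comment in vcpu_kick() describes a Dekker-style, lock-free handshake: the notifier publishes the pending flag before inspecting the target's state, and the target re-polls the flag after every state change, so at least one side always observes the other and no wakeup is lost. A minimal sketch using C11 atomics (sequentially consistent by default) rather than Xen's primitives:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool pending;   /* models evtchn_upcall_pending */
static atomic_bool running;   /* models v->is_running */

/* Notifier side: publish the event, then look at the target's state. */
static void kick(void)
{
    atomic_store(&pending, true);        /* must happen first */
    if ( atomic_load(&running) )
    {
        /* here Xen raises VCPU_KICK_SOFTIRQ on the target's CPU */
    }
}

/* Target side: every state change is followed by a re-poll of pending. */
static void go_to_sleep(void)
{
    atomic_store(&running, false);
    if ( atomic_load(&pending) )
    {
        /* wake straight back up and deliver the event */
    }
}

With sequential consistency it is impossible for kick() to read running as false while go_to_sleep() reads pending as false: whichever store comes later in the total order is seen by the other thread's subsequent load.
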
2056 | | |
2057 | | void vcpu_mark_events_pending(struct vcpu *v) |
2058 | 99.7k | { |
2059 | 99.7k | int already_pending = test_and_set_bit( |
2060 | 99.7k | 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); |
2061 | 99.7k | |
2062 | 99.7k | if ( already_pending ) |
2063 | 65 | return; |
2064 | 99.7k | |
2065 | 99.6k | if ( is_hvm_vcpu(v) ) |
2066 | 99.7k | hvm_assert_evtchn_irq(v); |
2067 | 99.6k | else |
2068 | 18.4E | vcpu_kick(v); |
2069 | 99.6k | } |
2070 | | |
2071 | | static void vcpu_kick_softirq(void) |
2072 | 34.4k | { |
2073 | 34.4k | /* |
2074 | 34.4k | * Nothing to do here: we merely prevent notifiers from racing with checks |
2075 | 34.4k | * executed on return to guest context with interrupts enabled. See, for |
2076 | 34.4k | * example, xxx_intr_assist() executed on return to HVM guest context. |
2077 | 34.4k | */ |
2078 | 34.4k | } |
2079 | | |
2080 | | static int __init init_vcpu_kick_softirq(void) |
2081 | 1 | { |
2082 | 1 | open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq); |
2083 | 1 | return 0; |
2084 | 1 | } |
2085 | | __initcall(init_vcpu_kick_softirq); |
2086 | | |
2087 | | |
2088 | | /* |
2089 | | * Local variables: |
2090 | | * mode: C |
2091 | | * c-file-style: "BSD" |
2092 | | * c-basic-offset: 4 |
2093 | | * tab-width: 4 |
2094 | | * indent-tabs-mode: nil |
2095 | | * End: |
2096 | | */ |