debuggers.hg

xen/arch/ia64/xen/vhpt.c @ 19964:3952eaeb70b0

Introduce and use a per-CPU read-mostly sub-section

Since mixing data that gets set up once and is then (perhaps
frequently) read by remote CPUs with data that the local CPU may
modify (again, perhaps frequently) still causes undesirable
cache-protocol-related bus traffic, separate the former class of
objects from the latter.

The objects converted here were picked simply for their write-once
(or write-very-rarely) behavior; further adjustments may be desirable
later on. The primary users of the new sub-section will be introduced
by the next patch.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jul 13 11:32:41 2009 +0100 (2009-07-13)
parents 5839491bbf20
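
For illustration, a minimal sketch of what the new sub-section is used for, drawn
from the declarations in this file (the underlying linker sub-section name is an
implementation detail not shown here): read-mostly per-CPU objects are declared
with DEFINE_PER_CPU_READ_MOSTLY instead of DEFINE_PER_CPU, so data written once
at setup and then only read (possibly by remote CPUs) does not share cache lines
with per-CPU data the owning CPU writes frequently.

    /* Written once in vhpt_init(), then only read (also by remote CPUs):
     * placed in the read-mostly per-CPU sub-section. */
    DEFINE_PER_CPU_READ_MOSTLY(unsigned long, vhpt_paddr);

    /* Updated on every local VHPT flush: stays in the regular per-CPU
     * data section. */
    DEFINE_PER_CPU(volatile u32, vhpt_tlbflush_timestamp);
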
/*
 * Initialize VHPT support.
 *
 * Copyright (C) 2004 Hewlett-Packard Co
 *      Dan Magenheimer <dan.magenheimer@hp.com>
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *                    per vcpu vhpt support
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/vhpt.h>
#include <asm/vcpu.h>
#include <asm/vcpumask.h>
#include <asm/vmmu.h>

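/* Physical start and end address of this CPU's VHPT (set once in
 * vhpt_init(), read-mostly afterwards) and the per-CPU TLB-flush clock
 * timestamp updated on every local VHPT flush. */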
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, vhpt_paddr);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, vhpt_pend);
#ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
DEFINE_PER_CPU(volatile u32, vhpt_tlbflush_timestamp);
#endif

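/* Invalidate every entry of a VHPT of 2^vhpt_size_log2 bytes; each
 * long-format entry is 32 bytes, hence the (vhpt_size_log2 - 5) below. */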
static void
__vhpt_flush(unsigned long vhpt_maddr, unsigned long vhpt_size_log2)
{
        struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        unsigned long num_entries = 1 << (vhpt_size_log2 - 5);
        int i;

        for (i = 0; i < num_entries; i++, v++)
                v->ti_tag = INVALID_TI_TAG;
}

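/* Flush this physical CPU's VHPT and record the flush time in the
 * per-CPU TLB-flush clock timestamp. */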
void
local_vhpt_flush(void)
{
        /* increment flush clock before flush */
        u32 flush_time = tlbflush_clock_inc_and_return();
        __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr), VHPT_SIZE_LOG2);
        /* this must be after flush */
        tlbflush_update_time(&__get_cpu_var(vhpt_tlbflush_timestamp),
                             flush_time);
        perfc_incr(local_vhpt_flush);
}

void
vcpu_vhpt_flush(struct vcpu* v)
{
        unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
        if (HAS_PERVCPU_VHPT(v->domain))
                vhpt_size_log2 = v->arch.pta.size;
#endif
        __vhpt_flush(vcpu_vhpt_maddr(v), vhpt_size_log2);
        perfc_incr(vcpu_vhpt_flush);
}

static void
vhpt_erase(unsigned long vhpt_maddr, unsigned long vhpt_size_log2)
{
        struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        unsigned long num_entries = 1 << (vhpt_size_log2 - 5);
        int i;

        for (i = 0; i < num_entries; i++, v++) {
                v->itir = 0;
                v->CChain = 0;
                v->page_flags = 0;
                v->ti_tag = INVALID_TI_TAG;
        }
        // initialize cache too???
}

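/* Insert one long-format VHPT entry for vadr. The tag is invalidated
 * first and written last, so a hardware VHPT walker never sees a valid
 * tag paired with stale itir/page_flags. */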
void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long itir)
{
        struct vhpt_lf_entry *vlfe = (struct vhpt_lf_entry *)ia64_thash(vadr);
        unsigned long tag = ia64_ttag (vadr);

        /* Even though VHPT is per VCPU, still need to first disable the entry,
         * because the processor may support speculative VHPT walk. */
        vlfe->ti_tag = INVALID_TI_TAG;
        wmb();
        vlfe->itir = itir;
        vlfe->page_flags = pte | _PAGE_P;
        *(volatile unsigned long*)&vlfe->ti_tag = tag;
}

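/* Insert VHPT entries covering one guest mapping of size 2^itir.ps:
 * the mapping is broken into 2^(itir.ps - vhpt_pg_shift) entries, one
 * per VHPT page. */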
void vhpt_multiple_insert(unsigned long vaddr, unsigned long pte,
                          unsigned long itir)
{
        unsigned char ps = current->arch.vhpt_pg_shift;
        ia64_itir_t _itir = {.itir = itir};
        unsigned long mask = (1L << _itir.ps) - 1;
        int i;

        if (_itir.ps - ps > 10 && !running_on_sim) {
                // if this happens, we may want to revisit this algorithm
                panic("vhpt_multiple_insert:logps-PAGE_SHIFT>10,spinning..\n");
        }
        if (_itir.ps - ps > 2) {
                // FIXME: Should add counter here to see how often this
                // happens (e.g. for 16MB pages!) and determine if it
                // is a performance problem. On a quick look, it takes
                // about 39000 instrs for a 16MB page and it seems to occur
                // only a few times/second, so OK for now.
                // An alternate solution would be to just insert the one
                // 16KB in the vhpt (but with the full mapping)?
                //printk("vhpt_multiple_insert: logps-PAGE_SHIFT==%d,"
                //       "va=%p, pa=%p, pa-masked=%p\n",
                //       logps-PAGE_SHIFT,vaddr,pte&_PFN_MASK,
                //       (pte&_PFN_MASK)&~mask);
        }
        vaddr &= ~mask;
        pte = ((pte & _PFN_MASK) & ~mask) | (pte & ~_PFN_MASK);
        for (i = 1L << (_itir.ps - ps); i > 0; i--) {
                vhpt_insert(vaddr, pte, _itir.itir);
                vaddr += (1L << ps);
        }
}

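/* Boot-time setup of this CPU's VHPT: allocate a table naturally
 * aligned to its size, record its physical bounds in the per-CPU
 * variables above and clear all entries. */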
void __init vhpt_init(void)
{
        unsigned long paddr;
        struct page_info *page;
#if !VHPT_ENABLED
        return;
#endif
        /* This allocation is only valid if the VHPT table is unique for
         * all domains; otherwise a new VHPT table would have to be
         * allocated from the domain heap when each domain is created.
         * Assume the Xen buddy allocator can provide a naturally aligned
         * page by order?
         */
        page = alloc_domheap_pages(NULL, VHPT_SIZE_LOG2 - PAGE_SHIFT, 0);
        if (!page)
                panic("vhpt_init: can't allocate VHPT!\n");
        paddr = page_to_maddr(page);
        if (paddr & ((1 << VHPT_SIZE_LOG2) - 1))
                panic("vhpt_init: bad VHPT alignment!\n");
        __get_cpu_var(vhpt_paddr) = paddr;
        __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
        printk(XENLOG_DEBUG "vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
               paddr, __get_cpu_var(vhpt_pend));
        vhpt_erase(paddr, VHPT_SIZE_LOG2);
        // we don't enable VHPT here.
        // context_switch() or schedule_tail() does it.
}

#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
void
domain_set_vhpt_size(struct domain *d, int8_t vhpt_size_log2)
{
        if (vhpt_size_log2 == -1) {
                d->arch.has_pervcpu_vhpt = 0;
                printk(XENLOG_INFO "XEN_DOMCTL_arch_setup: "
                       "domain %d VHPT is global.\n", d->domain_id);
        } else {
                d->arch.has_pervcpu_vhpt = 1;
                d->arch.vhpt_size_log2 = vhpt_size_log2;
                printk(XENLOG_INFO "XEN_DOMCTL_arch_setup: "
                       "domain %d VHPT is per vcpu. size=2**%d\n",
                       d->domain_id, vhpt_size_log2);
        }
}

int
pervcpu_vhpt_alloc(struct vcpu *v)
{
        unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;

        if (v->domain->arch.vhpt_size_log2 > 0)
                vhpt_size_log2 =
                        canonicalize_vhpt_size(v->domain->arch.vhpt_size_log2);
        printk(XENLOG_DEBUG "%s vhpt_size_log2=%ld\n",
               __func__, vhpt_size_log2);
        v->arch.vhpt_entries =
                (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
        v->arch.vhpt_page =
                alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
        if (!v->arch.vhpt_page)
                return -ENOMEM;

        v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
        if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
                panic("pervcpu_vhpt_init: bad VHPT alignment!\n");

        v->arch.pta.val = 0; // to zero reserved bits
        v->arch.pta.ve = 1; // enable vhpt
        v->arch.pta.size = vhpt_size_log2;
        v->arch.pta.vf = 1; // long format
        v->arch.pta.base = __va_ul(v->arch.vhpt_maddr) >> 15;

        vhpt_erase(v->arch.vhpt_maddr, vhpt_size_log2);
        smp_mb(); // per vcpu vhpt may be used by another physical cpu.
        return 0;
}

void
pervcpu_vhpt_free(struct vcpu *v)
{
        if (likely(v->arch.vhpt_page != NULL))
                free_domheap_pages(v->arch.vhpt_page,
                                   v->arch.pta.size - PAGE_SHIFT);
}
#endif

void
domain_purge_swtc_entries(struct domain *d)
{
        struct vcpu* v;
        for_each_vcpu(d, v) {
                if (!v->is_initialised)
                        continue;

                /* Purge TC entries.
                   FIXME: clear only if match. */
                vcpu_purge_tr_entry(&PSCBX(v,dtlb));
                vcpu_purge_tr_entry(&PSCBX(v,itlb));
        }
}

void
domain_purge_swtc_entries_vcpu_dirty_mask(struct domain* d,
                                          vcpumask_t vcpu_dirty_mask)
{
        int vcpu;

        for_each_vcpu_mask(d, vcpu, vcpu_dirty_mask) {
                struct vcpu* v = d->vcpu[vcpu];
                if (!v->is_initialised)
                        continue;

                /* Purge TC entries.
                   FIXME: clear only if match. */
                vcpu_purge_tr_entry(&PSCBX(v, dtlb));
                vcpu_purge_tr_entry(&PSCBX(v, itlb));
        }
}

// SMP: we can't assume v == current, the vcpu might move to another
// physical cpu, so a memory barrier is necessary.
// If we can guarantee that the vcpu can run only on this physical cpu
// (e.g. vcpu == current), smp_mb() is unnecessary.
void vcpu_flush_vtlb_all(struct vcpu *v)
{
        /* First VCPU tlb. */
        vcpu_purge_tr_entry(&PSCBX(v,dtlb));
        vcpu_purge_tr_entry(&PSCBX(v,itlb));
        smp_mb();

        /* Then VHPT. */
        if (HAS_PERVCPU_VHPT(v->domain))
                vcpu_vhpt_flush(v);
        else
                local_vhpt_flush();
        smp_mb();

        /* Then mTLB. */
        local_flush_tlb_all();

        /* We could clear the bit in d->domain_dirty_cpumask only if domain d
           is not running on this processor. There is currently no easy way
           to check this. */

        perfc_incr(vcpu_flush_vtlb_all);
}

static void __vcpu_flush_vtlb_all(void *vcpu)
{
        vcpu_flush_vtlb_all((struct vcpu*)vcpu);
}

// The caller must have incremented the reference count of d somehow.
void domain_flush_vtlb_all(struct domain* d)
{
        int cpu = smp_processor_id ();
        struct vcpu *v;

        for_each_vcpu(d, v) {
                if (!v->is_initialised)
                        continue;

                if (VMX_DOMAIN(v)) {
                        // This code may be called for remapping shared_info
                        // and grant_table from guest_physmap_remove_page()
                        // in arch_memory_op() XENMEM_add_to_physmap to realize
                        // the PV-on-HVM feature.
                        vmx_vcpu_flush_vtlb_all(v);
                        continue;
                }

                if (v->processor == cpu)
                        vcpu_flush_vtlb_all(v);
                else
                        // SMP: it is racy to reference v->processor.
                        // The vcpu scheduler may move this vcpu to another
                        // physical processor, and change the value
                        // using a plain store.
                        // We may be seeing the old value of it.
                        // In such a case, flush_vtlb_for_context_switch()
                        // takes care of the mTLB flush.
                        smp_call_function_single(v->processor,
                                                 __vcpu_flush_vtlb_all,
                                                 v, 1);
        }
        perfc_incr(domain_flush_vtlb_all);
}

// Callers may need to call smp_mb() before/after calling this.
// Be careful.
static void
__flush_vhpt_range(unsigned long vhpt_maddr, u64 vadr, u64 addr_range)
{
        void *vhpt_base = __va(vhpt_maddr);
        u64 pgsz = 1L << current->arch.vhpt_pg_shift;
        u64 purge_addr = vadr & PAGE_MASK;

        addr_range += vadr - purge_addr;
        addr_range = PAGE_ALIGN(addr_range);
        while ((long)addr_range > 0) {
                /* Get the VHPT entry. */
                unsigned int off = ia64_thash(purge_addr) -
                        __va_ul(vcpu_vhpt_maddr(current));
                struct vhpt_lf_entry *v = vhpt_base + off;
                v->ti_tag = INVALID_TI_TAG;
                addr_range -= pgsz;
                purge_addr += pgsz;
        }
}

static void
cpu_flush_vhpt_range(int cpu, u64 vadr, u64 addr_range)
{
        __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
}

static void
vcpu_flush_vhpt_range(struct vcpu* v, u64 vadr, u64 addr_range)
{
        __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
}

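/* Purge a virtual address range from the current vcpu's VHPT (or from
 * this CPU's global VHPT) and from the local TLB via ptc.l. */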
void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
{
        if (HAS_PERVCPU_VHPT(current->domain))
                vcpu_flush_vhpt_range(current, vadr, 1UL << log_range);
        else
                cpu_flush_vhpt_range(current->processor,
                                     vadr, 1UL << log_range);
        ia64_ptcl(vadr, log_range << 2);
        ia64_srlz_i();
        perfc_incr(vcpu_flush_tlb_vhpt_range);
}

void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range)
{
        struct vcpu *v;

#if 0
        // this only seems to occur at shutdown, but it does occur
        if ((!addr_range) || addr_range & (addr_range - 1)) {
                printk("vhpt_flush_address: weird range, spinning...\n");
                while(1);
        }
#endif

        domain_purge_swtc_entries(d);
        smp_mb();

        for_each_vcpu (d, v) {
                if (!v->is_initialised)
                        continue;

                if (HAS_PERVCPU_VHPT(d)) {
                        vcpu_flush_vhpt_range(v, vadr, addr_range);
                } else {
                        // SMP: it is racy to reference v->processor.
                        // The vcpu scheduler may move this vcpu to another
                        // physical processor, and change the value
                        // using a plain store.
                        // We may be seeing the old value of it.
                        // In such a case, flush_vtlb_for_context_switch()
                        // takes care of the mTLB flush.
                        /* Invalidate VHPT entries. */
                        cpu_flush_vhpt_range(v->processor, vadr, addr_range);
                }
        }
        // ptc.ga has release semantics.

        /* ptc.ga */
        platform_global_tlb_purge(vadr, vadr + addr_range,
                                  current->arch.vhpt_pg_shift);
        perfc_incr(domain_flush_vtlb_range);
}

#ifdef CONFIG_XEN_IA64_TLB_TRACK
#include <asm/tlb_track.h>
#include <asm/vmx_vcpu.h>
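/* Flush the mapping recorded in a TLB-track entry from the VHPTs of the
 * vcpus (or physical cpus) marked dirty in the entry, then purge the
 * machine TLB locally or globally as appropriate. */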
void
__domain_flush_vtlb_track_entry(struct domain* d,
                                const struct tlb_track_entry* entry)
{
        unsigned long rr7_rid;
        int swap_rr0 = 0;
        unsigned long old_rid;
        unsigned long vaddr = entry->vaddr;
        struct vcpu* v;
        int cpu;
        int vcpu;
        int local_purge = 1;

        /* TLB insert tracking is done in PAGE_SIZE units. */
        unsigned char ps = max_t(unsigned char,
                                 current->arch.vhpt_pg_shift, PAGE_SHIFT);
        /* This case isn't supported (yet). */
        BUG_ON(current->arch.vhpt_pg_shift > PAGE_SHIFT);

        BUG_ON((vaddr >> VRN_SHIFT) != VRN7);
        /*
         * heuristic:
         * dom0linux accesses grant-mapped pages via the kernel
         * straight-mapped area and it doesn't change the rr7 rid,
         * so it is likely that rr7 == entry->rid and we can avoid
         * the rid change.
         * When blktap is supported, this heuristic should be revised.
         */
        vcpu_get_rr(current, VRN7 << VRN_SHIFT, &rr7_rid);
        if (likely(rr7_rid == entry->rid)) {
                perfc_incr(tlb_track_use_rr7);
        } else {
                swap_rr0 = 1;
                vaddr = (vaddr << 3) >> 3; // force vrn0
                perfc_incr(tlb_track_swap_rr0);
        }

        // tlb_track_entry_printf(entry);
        if (swap_rr0) {
                vcpu_get_rr(current, 0, &old_rid);
                vcpu_set_rr(current, 0, entry->rid);
        }

        if (HAS_PERVCPU_VHPT(d)) {
                for_each_vcpu_mask(d, vcpu, entry->vcpu_dirty_mask) {
                        v = d->vcpu[vcpu];
                        if (!v->is_initialised)
                                continue;

                        /* Invalidate VHPT entries. */
                        vcpu_flush_vhpt_range(v, vaddr, 1L << ps);

                        /*
                         * current->processor == v->processor
                         * is racy. We may see an old v->processor, and
                         * a new physical processor of v might see the old
                         * vhpt entry and insert it into its tlb.
                         */
                        if (v != current)
                                local_purge = 0;
                }
        } else {
                for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
                        /* Invalidate VHPT entries. */
                        cpu_flush_vhpt_range(cpu, vaddr, 1L << ps);

                        if (d->vcpu[cpu] != current)
                                local_purge = 0;
                }
        }

        /* ptc.ga */
        if (local_purge) {
                ia64_ptcl(vaddr, ps << 2);
                perfc_incr(domain_flush_vtlb_local);
        } else {
                /* ptc.ga has release semantics. */
                platform_global_tlb_purge(vaddr, vaddr + (1L << ps), ps);
                perfc_incr(domain_flush_vtlb_global);
        }

        if (swap_rr0) {
                vcpu_set_rr(current, 0, old_rid);
        }
        perfc_incr(domain_flush_vtlb_track_entry);
}

void
domain_flush_vtlb_track_entry(struct domain* d,
                              const struct tlb_track_entry* entry)
{
        domain_purge_swtc_entries_vcpu_dirty_mask(d, entry->vcpu_dirty_mask);
        smp_mb();

        __domain_flush_vtlb_track_entry(d, entry);
}

#endif

static void flush_tlb_vhpt_all (struct domain *d)
{
        /* First VHPT. */
        local_vhpt_flush ();

        /* Then mTLB. */
        local_flush_tlb_all ();
}

void domain_flush_tlb_vhpt(struct domain *d)
{
        /* Very heavy... */
        if (HAS_PERVCPU_VHPT(d) || is_hvm_domain(d))
                on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
        else
                on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1);
        cpus_clear (d->domain_dirty_cpumask);
}

void flush_tlb_for_log_dirty(struct domain *d)
{
        struct vcpu *v;

        /* NB. There is no race because all vcpus are paused. */
        if (is_hvm_domain(d)) {
                for_each_vcpu (d, v) {
                        if (!v->is_initialised)
                                continue;
                        /* XXX: local_flush_tlb_all is called redundantly */
                        thash_purge_all(v);
                }
                smp_call_function((void (*)(void *))local_flush_tlb_all,
                                  NULL, 1);
        } else if (HAS_PERVCPU_VHPT(d)) {
                for_each_vcpu (d, v) {
                        if (!v->is_initialised)
                                continue;
                        vcpu_purge_tr_entry(&PSCBX(v,dtlb));
                        vcpu_purge_tr_entry(&PSCBX(v,itlb));
                        vcpu_vhpt_flush(v);
                }
                on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
        } else {
                on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1);
        }
        cpus_clear (d->domain_dirty_cpumask);
}

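/* Flush VHPT and machine TLB on every CPU in the mask: the local CPU is
 * handled directly, remote CPUs via smp_call_function_single(). */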
void flush_tlb_mask(const cpumask_t *mask)
{
        int cpu;

        cpu = smp_processor_id();
        if (cpu_isset(cpu, *mask))
                flush_tlb_vhpt_all (NULL);

        if (cpus_subset(*mask, *cpumask_of(cpu)))
                return;

        for_each_cpu_mask (cpu, *mask)
                if (cpu != smp_processor_id())
                        smp_call_function_single
                                (cpu, (void (*)(void *))flush_tlb_vhpt_all, NULL, 1);
}

#ifdef PERF_COUNTERS
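/* Perf counters: record the total number of VHPT entries and, for each
 * present CPU, how many entries in its VHPT are currently valid. */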
void gather_vhpt_stats(void)
{
        int i, cpu;

        perfc_set(vhpt_nbr_entries, VHPT_NUM_ENTRIES);

        for_each_present_cpu (cpu) {
                struct vhpt_lf_entry *v = __va(per_cpu(vhpt_paddr, cpu));
                unsigned long vhpt_valid = 0;

                for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
                        if (!(v->ti_tag & INVALID_TI_TAG))
                                vhpt_valid++;
                per_cpu(perfcounters, cpu)[PERFC_vhpt_valid_entries] = vhpt_valid;
        }
}
#endif