
view xen/arch/x86/cpu/mcheck/mce_intel.c @ 20908:7310235f74f8

x86 mca: Do not raise #GP when a guest writes values other than all 0s or all 1s to MCA CTL MSRs.

a) For the MCi_CTL MSRs, the guest can write any value. When read back,
the value is ANDed with the physical value. Some bits in the physical
value can be 0, either because they are read-only in hardware (e.g.
masked by AMD's MCi_CTL_MASK) or because Xen did not enable them.
If the guest writes a bit as 0 while that bit is 1 on the host, we do
not inject an MCE for that bank into the guest, since we cannot tell
whether the MCE was caused by the bit the guest cleared.

b) For the MCG_CTL MSR, the guest can write any value. When read back,
the value is ANDed with the physical value.
If the guest does not write all 1s, mca_ctl_conflict() simply does not
inject any vMCE into the guest when some bit is set in the physical MSR
but cleared in the guest's vMCG_CTL MSR.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jan 29 06:48:00 2010 +0000 (2010-01-29)
parents 257bd5e90294
children 805eae786b50
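
The behaviour described in a) and b) boils down to three small operations. The sketch below restates them in stand-alone C purely for illustration; it is not part of this changeset, and names such as vmca_ctl, vmce_bank_ctl and mca_ctl_conflicts() are invented here and differ from Xen's actual identifiers. Writes are accepted verbatim instead of raising #GP, reads are ANDed with the physical value, and injection is suppressed whenever the guest has cleared a control bit that is set physically.

#include <stdint.h>
#include <stdbool.h>

/* Illustrative state: the raw values the guest last wrote. */
struct vmca_ctl {
    uint64_t vmce_bank_ctl;   /* guest's view of MCi_CTL  */
    uint64_t vmce_mcg_ctl;    /* guest's view of MCG_CTL  */
};

/* a) Accept any guest write instead of injecting #GP. */
static void guest_wrmsr_mci_ctl(struct vmca_ctl *v, uint64_t val)
{
    v->vmce_bank_ctl = val;
}

/* ...but a read returns the written value ANDed with the physical one. */
static uint64_t guest_rdmsr_mci_ctl(const struct vmca_ctl *v,
                                    uint64_t phys_bank_ctl)
{
    return v->vmce_bank_ctl & phys_bank_ctl;
}

/*
 * a)+b) Only deliver a vMCE when the guest has not cleared any control
 * bit that is enabled physically; otherwise the error might stem from a
 * bit the guest believes to be off.
 */
static bool mca_ctl_conflicts(const struct vmca_ctl *v,
                              uint64_t phys_bank_ctl, uint64_t phys_mcg_ctl)
{
    return (~v->vmce_bank_ctl & phys_bank_ctl) ||
           (~v->vmce_mcg_ctl & phys_mcg_ctl);
}
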
line source
1 #include <xen/init.h>
2 #include <xen/types.h>
3 #include <xen/irq.h>
4 #include <xen/event.h>
5 #include <xen/kernel.h>
6 #include <xen/delay.h>
7 #include <xen/smp.h>
8 #include <xen/mm.h>
9 #include <asm/processor.h>
10 #include <public/sysctl.h>
11 #include <asm/system.h>
12 #include <asm/msr.h>
13 #include <asm/p2m.h>
14 #include "mce.h"
15 #include "x86_mca.h"
17 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
18 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
19 int cmci_support = 0;
20 int ser_support = 0;
22 static int nr_intel_ext_msrs = 0;
24 /* Below are for MCE handling */
25 struct mce_softirq_barrier {
26 atomic_t val;
27 atomic_t ingen;
28 atomic_t outgen;
29 };
31 static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
32 static struct mce_softirq_barrier mce_trap_bar;
34 /*
35 * mce_logout_lock should only be used in the trap handler,
36 * while MCIP has not been cleared yet in the global status
37 * register. Other use is not safe, since an MCE trap can
38 * happen at any moment, which would cause lock recursion.
39 */
40 static DEFINE_SPINLOCK(mce_logout_lock);
42 static atomic_t severity_cpu = ATOMIC_INIT(-1);
43 static atomic_t found_error = ATOMIC_INIT(0);
45 static void mce_barrier_enter(struct mce_softirq_barrier *);
46 static void mce_barrier_exit(struct mce_softirq_barrier *);
48 #ifdef CONFIG_X86_MCE_THERMAL
49 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
50 {
51 printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
52 smp_processor_id());
53 add_taint(TAINT_MACHINE_CHECK);
54 }
56 /* P4/Xeon Thermal transition interrupt handler */
57 static void intel_thermal_interrupt(struct cpu_user_regs *regs)
58 {
59 u32 l, h;
60 unsigned int cpu = smp_processor_id();
61 static s_time_t next[NR_CPUS];
63 ack_APIC_irq();
64 if (NOW() < next[cpu])
65 return;
67 next[cpu] = NOW() + MILLISECS(5000);
68 rdmsr(MSR_IA32_THERM_STATUS, l, h);
69 if (l & 0x1) {
70 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
71 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
72 cpu);
73 add_taint(TAINT_MACHINE_CHECK);
74 } else {
75 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
76 }
77 }
79 /* Thermal interrupt handler for this CPU setup */
80 static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs)
81 = unexpected_thermal_interrupt;
83 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
84 {
85 struct cpu_user_regs *old_regs = set_irq_regs(regs);
86 irq_enter();
87 vendor_thermal_interrupt(regs);
88 irq_exit();
89 set_irq_regs(old_regs);
90 }
92 /* P4/Xeon Thermal regulation detect and init */
93 static void intel_init_thermal(struct cpuinfo_x86 *c)
94 {
95 u32 l, h;
96 int tm2 = 0;
97 unsigned int cpu = smp_processor_id();
99 /* Thermal monitoring */
100 if (!cpu_has(c, X86_FEATURE_ACPI))
101 return; /* -ENODEV */
103 /* Clock modulation */
104 if (!cpu_has(c, X86_FEATURE_ACC))
105 return; /* -ENODEV */
107 /* first check if it's enabled already, in which case there might
108 * be some SMM goo which handles it, so we can't even put a handler
109 * since it might be delivered via SMI already -zwanem.
110 */
111 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
112 h = apic_read(APIC_LVTTHMR);
113 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
114 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
115 return; /* -EBUSY */
116 }
118 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
119 tm2 = 1;
121 /* check whether a vector already exists, temporarily masked? */
122 if (h & APIC_VECTOR_MASK) {
123 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
124 cpu, (h & APIC_VECTOR_MASK));
125 return; /* -EBUSY */
126 }
128 /* The temperature transition interrupt handler setup */
129 h = THERMAL_APIC_VECTOR; /* our delivery vector */
130 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
131 apic_write_around(APIC_LVTTHMR, h);
133 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
134 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
136 /* ok we're good to go... */
137 vendor_thermal_interrupt = intel_thermal_interrupt;
139 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
140 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
142 l = apic_read (APIC_LVTTHMR);
143 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
144 printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
145 cpu, tm2 ? "TM2" : "TM1");
146 return;
147 }
148 #endif /* CONFIG_X86_MCE_THERMAL */
150 static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
151 {
152 if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
153 && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
154 ext->mc_msr[ext->mc_msrs].reg = msr;
155 mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
156 ++ext->mc_msrs;
157 }
158 }
160 static enum mca_extinfo
161 intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
162 {
163 struct mcinfo_extended mc_ext;
165 if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
166 return MCA_EXTINFO_IGNORED;
168 /* this function will be called when CAP(9).MCG_EXT_P = 1 */
169 memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
170 mc_ext.common.type = MC_TYPE_EXTENDED;
171 mc_ext.common.size = sizeof(mc_ext);
173 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EAX);
174 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBX);
175 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ECX);
176 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDX);
177 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESI);
178 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDI);
179 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBP);
180 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESP);
181 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EFLAGS);
182 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EIP);
183 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_MISC);
185 #ifdef __x86_64__
186 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R8);
187 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R9);
188 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R10);
189 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R11);
190 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R12);
191 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R13);
192 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R14);
193 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R15);
194 #endif
196 x86_mcinfo_add(mci, &mc_ext);
198 return MCA_EXTINFO_GLOBAL;
199 }
201 /* This node list records errors impacting a domain. When an
202 * MCE# happens, one error bank impacts a domain. The error node
203 * is inserted at the tail of the per-domain data for vMCE# MSR
204 * virtualization. When a vMCE# injection has been fully processed
205 * by the guest, the corresponding node is deleted.
206 * This node list is for GUEST vMCE# MSR virtualization.
207 */
208 static struct bank_entry* alloc_bank_entry(void) {
209 struct bank_entry *entry;
211 entry = xmalloc(struct bank_entry);
212 if (!entry) {
213 printk(KERN_ERR "MCE: malloc bank_entry failed\n");
214 return NULL;
215 }
216 memset(entry, 0x0, sizeof(*entry));
217 INIT_LIST_HEAD(&entry->list);
218 return entry;
219 }
221 /* Fill error bank info for vMCE# injection and GUEST vMCE#
222 * MSR virtualization data:
223 * 1) Log how many injections (nr_injection) have impacted the domain.
224 * 2) Copy the MCE# error bank to the impacted domain's node list,
225 for vMCE# MSR virtualization.
226 */
228 static int fill_vmsr_data(struct mcinfo_bank *mc_bank,
229 uint64_t gstatus) {
230 struct domain *d;
231 struct bank_entry *entry;
233 /* This error bank impacts one domain, we need to fill domain related
234 * data for vMCE MSRs virtualization and vMCE# injection */
235 if (mc_bank->mc_domid != (uint16_t)~0) {
236 d = get_domain_by_id(mc_bank->mc_domid);
238 /* Does not impact a valid domain; skip this bank's error */
239 if (!d) {
240 mce_printk(MCE_QUIET, "MCE: Not found valid impacted DOM\n");
241 return 0;
242 }
244 /* For an HVM guest, only when the first vMCE is consumed successfully
245 * by the guest will we generate another node and inject another vMCE.
246 */
247 if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
248 {
249 mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
250 " vMCE yet!\n");
251 return -1;
252 }
253 entry = alloc_bank_entry();
254 if (entry == NULL)
255 return -1;
257 entry->mci_status = mc_bank->mc_status;
258 entry->mci_addr = mc_bank->mc_addr;
259 entry->mci_misc = mc_bank->mc_misc;
260 entry->bank = mc_bank->mc_bank;
262 spin_lock(&d->arch.vmca_msrs.lock);
263 /* New error node, insert at the tail of the per-domain data */
264 list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
265 /* Fill MSR global status */
266 d->arch.vmca_msrs.mcg_status = gstatus;
267 /* New node impacts the domain, needs another vMCE# injection */
268 d->arch.vmca_msrs.nr_injection++;
269 spin_unlock(&d->arch.vmca_msrs.lock);
271 mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
272 "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
273 mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
274 mc_bank->mc_domid);
275 }
276 return 0;
277 }
279 static int inject_mce(struct domain *d)
280 {
281 int cpu = smp_processor_id();
282 cpumask_t affinity;
284 /* PV guests and HVM guests have different vMCE# injection
285 * methods. */
287 if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
288 {
289 if (d->is_hvm)
290 {
291 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
292 d->domain_id);
293 vcpu_kick(d->vcpu[0]);
294 }
295 /* PV guest including DOM0 */
296 else
297 {
298 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
299 d->domain_id);
300 if (guest_has_trap_callback
301 (d, 0, TRAP_machine_check))
302 {
303 d->vcpu[0]->cpu_affinity_tmp =
304 d->vcpu[0]->cpu_affinity;
305 cpus_clear(affinity);
306 cpu_set(cpu, affinity);
307 mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", cpu,
308 d->vcpu[0]->processor);
309 vcpu_set_affinity(d->vcpu[0], &affinity);
310 vcpu_kick(d->vcpu[0]);
311 }
312 else
313 {
314 mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE handler\n");
315 domain_crash(d);
316 }
317 }
318 }
319 else {
320 /* A new vMCE arrived while the first one has not been injected yet;
321 * in this case the injection fails. [We can't lose this vMCE, for
322 * the sake of the mce node's consistency.]
323 */
324 mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
325 "to this DOM%d!\n", d->domain_id);
326 return -1;
327 }
328 return 0;
329 }
331 static void intel_UCR_handler(struct mcinfo_bank *bank,
332 struct mcinfo_global *global,
333 struct mcinfo_extended *extension,
334 struct mca_handle_result *result)
335 {
336 struct domain *d;
337 unsigned long mfn, gfn;
338 uint32_t status;
340 mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
341 result->result = MCA_NEED_RESET;
342 if (bank->mc_addr != 0) {
343 mfn = bank->mc_addr >> PAGE_SHIFT;
344 if (!offline_page(mfn, 1, &status)) {
345 /* This is a free page */
346 if (status & PG_OFFLINE_OFFLINED)
347 result->result = MCA_RECOVERED;
348 else if (status & PG_OFFLINE_PENDING) {
349 /* This page has owner */
350 if (status & PG_OFFLINE_OWNED) {
351 result->result |= MCA_OWNER;
352 result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
353 mce_printk(MCE_QUIET, "MCE: This error page is ownded"
354 " by DOM %d\n", result->owner);
355 /* Fill vMCE# injection and vMCE# MSR virtualization
356 * related data */
357 bank->mc_domid = result->owner;
358 /* XXX: Cannot handle shared pages yet
359 * (this should identify all domains and gfn mapping to
360 * the mfn in question) */
361 BUG_ON( result->owner == DOMID_COW );
362 if ( result->owner != DOMID_XEN ) {
364 d = get_domain_by_id(result->owner);
365 if ( mca_ctl_conflict(bank, d) )
366 {
367 /* Guest has a different MCE ctl from the hypervisor */
368 put_domain(d);
369 return;
370 }
372 gfn =
373 mfn_to_gmfn(d, ((bank->mc_addr) >> PAGE_SHIFT));
374 bank->mc_addr =
375 gfn << PAGE_SHIFT | (bank->mc_addr & PAGE_MASK);
376 if (fill_vmsr_data(bank, global->mc_gstatus) == -1)
377 {
378 mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
379 "failed\n", result->owner);
380 domain_crash(d);
381 return;
382 }
383 /* We will inject a vMCE into the DomU */
384 if ( inject_mce(d) < 0 )
385 {
386 mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
387 " failed\n", d->domain_id);
388 domain_crash(d);
389 return;
390 }
391 /* The impacted domain goes on with its own recovery job
392 * if it has its own MCA handler.
393 * For Xen, it has contained the error and finished
394 * its own recovery job.
395 */
396 result->result = MCA_RECOVERED;
397 }
398 }
399 }
400 }
401 }
402 }
404 #define INTEL_MAX_RECOVERY 2
405 struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
406 {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
408 /*
409 * Called from mctelem_process_deferred. Return 1 if the telemetry
410 * should be committed for dom0 consumption, 0 if it should be
411 * dismissed.
412 */
413 static int mce_action(mctelem_cookie_t mctc)
414 {
415 struct mc_info *local_mi;
416 uint32_t i;
417 struct mcinfo_common *mic = NULL;
418 struct mcinfo_global *mc_global;
419 struct mcinfo_bank *mc_bank;
420 struct mca_handle_result mca_res;
422 local_mi = (struct mc_info*)mctelem_dataptr(mctc);
423 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
424 if (mic == NULL) {
425 printk(KERN_ERR "MCE: get local buffer entry failed\n ");
426 return 0;
427 }
429 mc_global = (struct mcinfo_global *)mic;
431 /* Processing bank information */
432 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
434 for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
435 if (mic->type != MC_TYPE_BANK) {
436 continue;
437 }
438 mc_bank = (struct mcinfo_bank*)mic;
440 /* TODO: Add recovery actions here, such as page-offline, etc */
441 memset(&mca_res, 0x0f, sizeof(mca_res));
442 for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
443 if ( ((mc_bank->mc_status & 0xffff) ==
444 intel_recovery_handler[i].mca_code) ||
445 ((mc_bank->mc_status & 0xfff0) ==
446 intel_recovery_handler[i].mca_code)) {
447 /* For SRAR, OVER = 1 should already have caused a reset.
448 * For SRAO, OVER = 1 skips the recovery action and continues execution.
449 */
450 if (!(mc_bank->mc_status & MCi_STATUS_OVER))
451 intel_recovery_handler[i].recovery_handler
452 (mc_bank, mc_global, NULL, &mca_res);
453 else {
454 if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
455 mca_res.result = MCA_NEED_RESET;
456 else
457 mca_res.result = MCA_NO_ACTION;
458 }
459 if (mca_res.result & MCA_OWNER)
460 mc_bank->mc_domid = mca_res.owner;
461 if (mca_res.result == MCA_NEED_RESET)
462 /* DOMID_XEN*/
463 mc_panic("MCE: Software recovery failed for the UCR "
464 "error\n");
465 else if (mca_res.result == MCA_RECOVERED)
466 mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
467 "successfully recovered by software!\n");
468 else if (mca_res.result == MCA_NO_ACTION)
469 mce_printk(MCE_VERBOSE, "MCE: Overwrite SRAO error can't"
470 "do recover action, RIPV=1, let it be.\n");
471 break;
472 }
473 }
474 /* For an SRAR error with no defined recovery action, a reset
475 * should already have been caused in the MCA handler.
476 */
477 if ( i >= INTEL_MAX_RECOVERY )
478 mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
479 " found for this SRAO error\n");
481 }
482 return 1;
483 }
485 /* Softirq Handler for this MCE# processing */
486 static void mce_softirq(void)
487 {
488 int cpu = smp_processor_id();
489 unsigned int workcpu;
491 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
493 mce_barrier_enter(&mce_inside_bar);
495 /*
496 * Everybody is here. Now let's see who gets to do the
497 * recovery work. Right now we just see if there's a CPU
498 * that did not have any problems, and pick that one.
499 *
500 * First, just set a default value: the last CPU who reaches this
501 * will overwrite the value and become the default.
502 */
504 atomic_set(&severity_cpu, cpu);
506 mce_barrier_enter(&mce_severity_bar);
507 if (!mctelem_has_deferred(cpu))
508 atomic_set(&severity_cpu, cpu);
509 mce_barrier_exit(&mce_severity_bar);
511 /* We choose severity_cpu for further processing */
512 if (atomic_read(&severity_cpu) == cpu) {
514 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
516 /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
517 * vMCE MSRs virtualization buffer
518 */
519 for_each_online_cpu(workcpu) {
520 mctelem_process_deferred(workcpu, mce_action);
521 }
523 /* Step2: Send Log to DOM0 through vIRQ */
524 if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
525 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
526 send_guest_global_virq(dom0, VIRQ_MCA);
527 }
528 }
530 mce_barrier_exit(&mce_inside_bar);
531 }
533 /* Machine check owner judging algorithm:
534 * When an error happens, all CPUs serially read their MSR banks.
535 * The first CPU that fetches the error bank's info will clear
536 * this bank. Later readers can't get any info from it again.
537 * The first CPU is the actual mce_owner.
538 *
539 * A fatal (pcc=1) error might crash the machine
540 * before we are able to log it. To avoid missing the log, we adopt a
541 * two-round scan:
542 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
543 * All MCE banks are sticky; when booting up, the MCE polling mechanism
544 * will help to collect and log those MCE errors.
545 * Round 2: do all the MCE processing logic as normal.
546 */
548 static void mce_panic_check(void)
549 {
550 if (is_mc_panic) {
551 local_irq_enable();
552 for ( ; ; )
553 halt();
554 }
555 }
557 /*
558 * Initialize a barrier. Just set it to 0.
559 */
560 static void mce_barrier_init(struct mce_softirq_barrier *bar)
561 {
562 atomic_set(&bar->val, 0);
563 atomic_set(&bar->ingen, 0);
564 atomic_set(&bar->outgen, 0);
565 }
567 #if 0
568 /*
569 * This function will need to be used when offlining a CPU in the
570 * recovery actions.
571 *
572 * Decrement a barrier only. Needed for cases where the CPU
573 * in question can't do it itself (e.g. it is being offlined).
574 */
575 static void mce_barrier_dec(struct mce_softirq_barrier *bar)
576 {
577 atomic_inc(&bar->outgen);
578 wmb();
579 atomic_dec(&bar->val);
580 }
581 #endif
583 static void mce_spin_lock(spinlock_t *lk)
584 {
585 while (!spin_trylock(lk)) {
586 cpu_relax();
587 mce_panic_check();
588 }
589 }
591 static void mce_spin_unlock(spinlock_t *lk)
592 {
593 spin_unlock(lk);
594 }
596 /*
597 * Increment the generation number and the value. The generation number
598 * is incremented when entering a barrier. This way, it can be checked
599 * on exit if a CPU is trying to re-enter the barrier. This can happen
600 * if the first CPU to make it out immediately exits or re-enters, while
601 * another CPU that is still in the loop becomes otherwise occupied
602 * (e.g. it needs to service an interrupt, etc), missing the value
603 * it's waiting for.
604 *
605 * These barrier functions should always be paired, so that the
606 * counter value will reach 0 again after all CPUs have exited.
607 */
608 static void mce_barrier_enter(struct mce_softirq_barrier *bar)
609 {
610 int gen;
612 atomic_inc(&bar->ingen);
613 gen = atomic_read(&bar->outgen);
614 mb();
615 atomic_inc(&bar->val);
616 while ( atomic_read(&bar->val) != num_online_cpus() &&
617 atomic_read(&bar->outgen) == gen) {
618 mb();
619 mce_panic_check();
620 }
621 }
623 static void mce_barrier_exit(struct mce_softirq_barrier *bar)
624 {
625 int gen;
627 atomic_inc(&bar->outgen);
628 gen = atomic_read(&bar->ingen);
629 mb();
630 atomic_dec(&bar->val);
631 while ( atomic_read(&bar->val) != 0 &&
632 atomic_read(&bar->ingen) == gen ) {
633 mb();
634 mce_panic_check();
635 }
636 }
638 #if 0
639 static void mce_barrier(struct mce_softirq_barrier *bar)
640 {
641 mce_barrier_enter(bar);
642 mce_barrier_exit(bar);
643 }
644 #endif
646 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
647 {
648 uint64_t gstatus;
649 mctelem_cookie_t mctc = NULL;
650 struct mca_summary bs;
651 cpu_banks_t clear_bank;
653 mce_spin_lock(&mce_logout_lock);
655 memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
656 mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
658 if (bs.errcnt) {
659 /*
660 * Uncorrected errors must be dealt with in softirq context.
661 */
662 if (bs.uc || bs.pcc) {
663 add_taint(TAINT_MACHINE_CHECK);
664 if (mctc != NULL)
665 mctelem_defer(mctc);
666 /*
667 * For PCC=1, which can't be recovered, context is lost, so reboot now without
668 * clearing the banks, and deal with the telemetry after reboot
669 * (the MSRs are sticky)
670 */
671 if (bs.pcc)
672 mc_panic("State lost due to machine check exception.\n");
673 if (!bs.ripv)
674 mc_panic("RIPV =0 can't resume execution!\n");
675 if (!bs.recoverable)
676 mc_panic("Machine check exception software recovery fail.\n");
677 } else {
678 if (mctc != NULL)
679 mctelem_commit(mctc);
680 }
681 atomic_set(&found_error, 1);
683 mce_printk(MCE_VERBOSE, "MCE: clear_bank map %lx on CPU%d\n",
684 *((unsigned long*)clear_bank), smp_processor_id());
685 mcheck_mca_clearbanks(clear_bank);
686 /* Print MCE error */
687 x86_mcinfo_dump(mctelem_dataptr(mctc));
689 } else {
690 if (mctc != NULL)
691 mctelem_dismiss(mctc);
692 }
693 mce_spin_unlock(&mce_logout_lock);
695 /*
696 * Wait until everybody has processed the trap.
697 */
698 mce_barrier_enter(&mce_trap_bar);
699 /* According to the latest MCA OS writer's guide, if no error bank is
700 * found on any CPU, something unexpected is happening; we can't do any
701 * recovery job but reset the system.
702 */
703 if (atomic_read(&found_error) == 0)
704 mc_panic("Unexpected condition for the MCE handler, need reset\n");
705 mce_barrier_exit(&mce_trap_bar);
707 /* Clear the error-found flag after all CPUs finish the above judgement */
708 mce_barrier_enter(&mce_trap_bar);
709 if (atomic_read(&found_error)) {
710 mce_printk(MCE_VERBOSE, "MCE: Choose one CPU "
711 "to clear error finding flag\n ");
712 atomic_set(&found_error, 0);
713 }
714 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
715 if ((gstatus & MCG_STATUS_MCIP) != 0) {
716 mce_printk(MCE_VERBOSE, "MCE: Clear MCIP@ last step");
717 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
718 }
719 mce_barrier_exit(&mce_trap_bar);
721 raise_softirq(MACHINE_CHECK_SOFTIRQ);
722 }
724 /* According to the MCA OS writer's guide, the CMCI handler needs to clear the bank when
725 * 1) CE (UC = 0)
726 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
727 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
728 * The MCA handler needs to clear the bank when
729 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
730 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
731 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
732 */
734 static int intel_need_clearbank_scan(enum mca_source who, u64 status)
735 {
736 if ( who == MCA_CMCI_HANDLER) {
737 /* A CE (UC = 0) needs the bank cleared */
738 if ( !(status & MCi_STATUS_UC) )
739 return 1;
740 /* A spurious error needs the bank cleared */
741 else if ( ser_support && !(status & MCi_STATUS_OVER)
742 && !(status & MCi_STATUS_EN) )
743 return 1;
744 /* UCNA with OVER = 0 needs the bank cleared */
745 else if ( ser_support && !(status & MCi_STATUS_OVER)
746 && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
747 && !(status & MCi_STATUS_AR))
748 return 1;
749 /* Only log, no clearing */
750 else return 0;
751 }
752 else if ( who == MCA_MCE_SCAN) {
753 /* A spurious error needs the bank cleared */
754 if ( ser_support && !(status & MCi_STATUS_OVER)
755 && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
756 return 1;
757 /* SRAR with OVER = 0: clear the bank. OVER = 1 should have caused a reset */
758 else if ( ser_support && (status & MCi_STATUS_UC)
759 && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
760 && (status & MCi_STATUS_OVER) )
761 return 1;
762 /* SRAO needs the bank cleared */
763 else if ( ser_support && !(status & MCi_STATUS_AR)
764 && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
765 return 1;
766 else
767 return 0;
768 }
770 return 1;
771 }
773 /* MCE continues / is recoverable when
774 * 1) CE: UC = 0
775 * 2) Spurious: ser_support = 1, OVER = 0, EN = 0 [UC = 1]
776 * 3) SRAR: ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
777 * 4) SRAO: ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
778 * 5) UCNA: ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0 [UC = 1]
779 */
780 static int intel_recoverable_scan(u64 status)
781 {
783 if ( !(status & MCi_STATUS_UC ) )
784 return 1;
785 else if ( ser_support && !(status & MCi_STATUS_EN)
786 && !(status & MCi_STATUS_OVER) )
787 return 1;
788 /* SRAR error */
789 else if ( ser_support && !(status & MCi_STATUS_OVER)
790 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
791 && (status & MCi_STATUS_AR) ) {
792 mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
793 return 0;
794 }
795 /* SRAO error */
796 else if (ser_support && !(status & MCi_STATUS_PCC)
797 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
798 && (status & MCi_STATUS_EN))
799 return 1;
800 /* UCNA error */
801 else if (ser_support && !(status & MCi_STATUS_OVER)
802 && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
803 && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
804 return 1;
805 return 0;
806 }
808 static DEFINE_SPINLOCK(cmci_discover_lock);
810 /*
811 * Discover bank sharing using the algorithm recommended in the SDM.
812 */
813 static int do_cmci_discover(int i)
814 {
815 unsigned msr = MSR_IA32_MC0_CTL2 + i;
816 u64 val;
818 rdmsrl(msr, val);
819 /* Some other CPU already owns this bank. */
820 if (val & CMCI_EN) {
821 clear_bit(i, __get_cpu_var(mce_banks_owned));
822 goto out;
823 }
824 wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
825 rdmsrl(msr, val);
827 if (!(val & CMCI_EN)) {
828 /* This bank does not support CMCI. Polling timer has to handle it. */
829 set_bit(i, __get_cpu_var(no_cmci_banks));
830 return 0;
831 }
832 set_bit(i, __get_cpu_var(mce_banks_owned));
833 out:
834 clear_bit(i, __get_cpu_var(no_cmci_banks));
835 return 1;
836 }
838 static void cmci_discover(void)
839 {
840 unsigned long flags;
841 int i;
842 mctelem_cookie_t mctc;
843 struct mca_summary bs;
845 mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
847 spin_lock_irqsave(&cmci_discover_lock, flags);
849 for (i = 0; i < nr_mce_banks; i++)
850 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
851 do_cmci_discover(i);
853 spin_unlock_irqrestore(&cmci_discover_lock, flags);
855 /* In case a CMCI happened during the owner change:
856 * if a CMCI happened but was not processed immediately,
857 * MCi_STATUS (error_count, bits 38~52) is not cleared and
858 * the CMCI interrupt will never be triggered again.
859 */
861 mctc = mcheck_mca_logout(
862 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
864 if (bs.errcnt && mctc != NULL) {
865 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
866 mctelem_commit(mctc);
867 send_guest_global_virq(dom0, VIRQ_MCA);
868 } else {
869 x86_mcinfo_dump(mctelem_dataptr(mctc));
870 mctelem_dismiss(mctc);
871 }
872 } else if (mctc != NULL)
873 mctelem_dismiss(mctc);
875 mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
876 smp_processor_id(),
877 *((unsigned long *)__get_cpu_var(mce_banks_owned)),
878 *((unsigned long *)__get_cpu_var(no_cmci_banks)));
879 }
881 /*
882 * Define an owner for each bank. Banks can be shared between CPUs,
883 * so to avoid reporting events multiple times always set up one
884 * CPU as the owner.
885 *
886 * The assignment has to be redone when CPUs go offline and
887 * any of the owners goes away. Also pollers run in parallel so we
888 * have to be careful to update the banks in a way that doesn't
889 * lose or duplicate events.
890 */
892 static void mce_set_owner(void)
893 {
894 if (!cmci_support || mce_disabled == 1)
895 return;
897 cmci_discover();
898 }
900 static void __cpu_mcheck_distribute_cmci(void *unused)
901 {
902 cmci_discover();
903 }
905 void cpu_mcheck_distribute_cmci(void)
906 {
907 if (cmci_support && !mce_disabled)
908 on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
909 }
911 static void clear_cmci(void)
912 {
913 int i;
915 if (!cmci_support || mce_disabled == 1)
916 return;
918 mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
919 smp_processor_id());
921 for (i = 0; i < nr_mce_banks; i++) {
922 unsigned msr = MSR_IA32_MC0_CTL2 + i;
923 u64 val;
924 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
925 continue;
926 rdmsrl(msr, val);
927 if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
928 wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
929 clear_bit(i, __get_cpu_var(mce_banks_owned));
930 }
931 }
933 void cpu_mcheck_disable(void)
934 {
935 clear_in_cr4(X86_CR4_MCE);
937 if (cmci_support && !mce_disabled)
938 clear_cmci();
939 }
941 static void intel_init_cmci(struct cpuinfo_x86 *c)
942 {
943 u32 l, apic;
944 int cpu = smp_processor_id();
946 if (!mce_available(c) || !cmci_support) {
947 mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
948 return;
949 }
951 apic = apic_read(APIC_CMCI);
952 if ( apic & APIC_VECTOR_MASK )
953 {
954 mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
955 cpu, ( apic & APIC_VECTOR_MASK ));
956 return;
957 }
959 apic = CMCI_APIC_VECTOR;
960 apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
961 apic_write_around(APIC_CMCI, apic);
963 l = apic_read(APIC_CMCI);
964 apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
965 }
967 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
968 {
969 mctelem_cookie_t mctc;
970 struct mca_summary bs;
971 struct cpu_user_regs *old_regs = set_irq_regs(regs);
973 ack_APIC_irq();
974 irq_enter();
976 mctc = mcheck_mca_logout(
977 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
979 if (bs.errcnt && mctc != NULL) {
980 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
981 mctelem_commit(mctc);
982 mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
983 send_guest_global_virq(dom0, VIRQ_MCA);
984 } else {
985 x86_mcinfo_dump(mctelem_dataptr(mctc));
986 mctelem_dismiss(mctc);
987 }
988 } else if (mctc != NULL)
989 mctelem_dismiss(mctc);
991 irq_exit();
992 set_irq_regs(old_regs);
993 }
995 void mce_intel_feature_init(struct cpuinfo_x86 *c)
996 {
998 #ifdef CONFIG_X86_MCE_THERMAL
999 intel_init_thermal(c);
1000 #endif
1001 intel_init_cmci(c);
1002 }
1004 static void _mce_cap_init(struct cpuinfo_x86 *c)
1005 {
1006 u32 l = mce_cap_init();
1008 if ((l & MCG_CMCI_P) && cpu_has_apic)
1009 cmci_support = 1;
1011 /* Support Software Error Recovery */
1012 if (l & MCG_SER_P)
1013 ser_support = 1;
1015 if (l & MCG_EXT_P)
1016 {
1017 nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
1018 mce_printk (MCE_QUIET, "CPU%d: Intel Extended MCE MSRs (%d) available\n",
1019 smp_processor_id(), nr_intel_ext_msrs);
1020 }
1021 firstbank = mce_firstbank(c);
1022 }
1024 static void mce_init(void)
1025 {
1026 u32 l, h;
1027 int i;
1028 mctelem_cookie_t mctc;
1029 struct mca_summary bs;
1031 clear_in_cr4(X86_CR4_MCE);
1033 mce_barrier_init(&mce_inside_bar);
1034 mce_barrier_init(&mce_severity_bar);
1035 mce_barrier_init(&mce_trap_bar);
1036 spin_lock_init(&mce_logout_lock);
1038 /* Log the machine checks left over from the previous reset.
1039 * This also clears all registers. */
1041 mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
1043 /* In the boot-up stage, print out and also log in the DOM0 boot process */
1044 if (bs.errcnt && mctc != NULL) {
1045 x86_mcinfo_dump(mctelem_dataptr(mctc));
1046 mctelem_commit(mctc);
1047 }
1049 set_in_cr4(X86_CR4_MCE);
1051 for (i = firstbank; i < nr_mce_banks; i++)
1052 {
1053 /* Some banks are shared across cores, use MCi_CTL to judge whether
1054 * this bank has been initialized by other cores already. */
1055 rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
1056 if (!(l | h))
1057 {
1058 /* if ctl is 0, this bank was never initialized */
1059 mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
1060 wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
1061 wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
1062 }
1063 }
1064 if (firstbank) /* if cmci enabled, firstbank = 0 */
1065 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
1066 }
1068 /* P4/P6 families have a similar MCA initialization process */
1069 int intel_mcheck_init(struct cpuinfo_x86 *c)
1070 {
1071 _mce_cap_init(c);
1072 mce_printk(MCE_QUIET, "Intel machine check reporting enabled on CPU#%d.\n",
1073 smp_processor_id());
1075 /* machine check is available */
1076 x86_mce_vector_register(intel_machine_check);
1077 x86_mce_callback_register(intel_get_extended_msrs);
1078 mce_recoverable_register(intel_recoverable_scan);
1079 mce_need_clearbank_register(intel_need_clearbank_scan);
1081 mce_init();
1082 mce_intel_feature_init(c);
1083 mce_set_owner();
1085 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1086 return 1;
1087 }
1089 int intel_mce_wrmsr(uint32_t msr, uint64_t val)
1090 {
1091 int ret = 1;
1093 switch ( msr )
1094 {
1095 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1096 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1097 "Guest should not write this MSR!\n");
1098 break;
1099 default:
1100 ret = 0;
1101 break;
1102 }
1104 return ret;
1105 }
1107 int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
1108 {
1109 int ret = 1;
1111 switch ( msr )
1112 {
1113 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1114 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1115 "Guest should not read this MSR!\n");
1116 break;
1117 default:
1118 ret = 0;
1119 break;
1120 }
1122 return ret;
1123 }
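
For context, the two wrapper functions above are only meaningful together with a caller that falls back to the generic path when they return 0. The fragment below is a hypothetical caller sketch, not part of this file; vmce_wrmsr_dispatch() and the fall-back comment are assumptions for illustration. It shows the intended contract: a non-zero return means the vendor hook has fully handled, and here deliberately ignored, the access.

/* Hypothetical dispatch sketch, not Xen code: the vendor hook is assumed to
 * return non-zero once it has fully handled (here: swallowed) the access. */
static int vmce_wrmsr_dispatch(uint32_t msr, uint64_t val)
{
    if ( intel_mce_wrmsr(msr, val) )
        return 1;    /* handled; MCi_CTL2 writes are simply ignored */

    /* Otherwise fall back to the generic vMCE MSR emulation path. */
    return 0;
}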