
view xen/arch/x86/cpu/mcheck/mce_intel.c @ 20911:088f1b01d852

x86 mca: Add MCE broadcast checking.

Some platforms broadcast MCE to all logical processors, while some
platforms do not. Distinguishing these platforms is helpful for a
unified MCA handler.

the "mce_fb" is a option to emulate the broadcast MCA in non-broadcast
platform. This is mainly for MCA software trigger.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jan 29 06:49:42 2010 +0000 (2010-01-29)
parents da5faf9f5df8
children ebd2495ec073
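
The "mce_fb" option mentioned in the description is a boot-time switch for forcing broadcast-style MCA handling on platforms that do not broadcast MCE#. Below is a minimal sketch of how such a switch is typically wired up with Xen's boot-parameter machinery; the variable name mce_force_broadcast and the helper is_mce_broadcast() are illustrative assumptions, not the code introduced by this changeset.

/* Illustrative sketch only -- names are assumptions, not this changeset's code. */
static int mce_force_broadcast;               /* hypothetical flag behind the "mce_fb" option */
boolean_param("mce_fb", mce_force_broadcast); /* Xen's command-line boolean parameter hook */

static int is_mce_broadcast(struct cpuinfo_x86 *c)
{
    /* Treat the platform as broadcasting when forced via "mce_fb";
     * real detection would additionally probe the CPU family/model in c. */
    return mce_force_broadcast;
}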
line source
1 #include <xen/init.h>
2 #include <xen/types.h>
3 #include <xen/irq.h>
4 #include <xen/event.h>
5 #include <xen/kernel.h>
6 #include <xen/delay.h>
7 #include <xen/smp.h>
8 #include <xen/mm.h>
9 #include <asm/processor.h>
10 #include <public/sysctl.h>
11 #include <asm/system.h>
12 #include <asm/msr.h>
13 #include <asm/p2m.h>
14 #include "mce.h"
15 #include "x86_mca.h"
17 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
18 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
19 int cmci_support = 0;
20 int ser_support = 0;
22 static int nr_intel_ext_msrs = 0;
24 /* Below are for MCE handling */
25 struct mce_softirq_barrier {
26 atomic_t val;
27 atomic_t ingen;
28 atomic_t outgen;
29 };
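/*
 * Note: val counts the CPUs currently inside the barrier, while ingen and
 * outgen are entry/exit generation counters used to detect a CPU re-entering
 * the barrier; see the comment above mce_barrier_enter() further below.
 */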
31 static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
32 static struct mce_softirq_barrier mce_trap_bar;
34 /*
35 * mce_logout_lock should only be used in the trap handler,
36 * while MCIP has not been cleared yet in the global status
37 * register. Other use is not safe, since an MCE trap can
38 * happen at any moment, which would cause lock recursion.
39 */
40 static DEFINE_SPINLOCK(mce_logout_lock);
42 static atomic_t severity_cpu = ATOMIC_INIT(-1);
43 static atomic_t found_error = ATOMIC_INIT(0);
45 static void mce_barrier_enter(struct mce_softirq_barrier *);
46 static void mce_barrier_exit(struct mce_softirq_barrier *);
48 #ifdef CONFIG_X86_MCE_THERMAL
49 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
50 {
51 printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
52 smp_processor_id());
53 add_taint(TAINT_MACHINE_CHECK);
54 }
56 /* P4/Xeon Thermal transition interrupt handler */
57 static void intel_thermal_interrupt(struct cpu_user_regs *regs)
58 {
59 u32 l, h;
60 unsigned int cpu = smp_processor_id();
61 static s_time_t next[NR_CPUS];
63 ack_APIC_irq();
64 if (NOW() < next[cpu])
65 return;
67 next[cpu] = NOW() + MILLISECS(5000);
68 rdmsr(MSR_IA32_THERM_STATUS, l, h);
69 if (l & 0x1) {
70 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
71 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
72 cpu);
73 add_taint(TAINT_MACHINE_CHECK);
74 } else {
75 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
76 }
77 }
79 /* Thermal interrupt handler for this CPU setup */
80 static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs)
81 = unexpected_thermal_interrupt;
83 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
84 {
85 struct cpu_user_regs *old_regs = set_irq_regs(regs);
86 irq_enter();
87 vendor_thermal_interrupt(regs);
88 irq_exit();
89 set_irq_regs(old_regs);
90 }
92 /* P4/Xeon Thermal regulation detect and init */
93 static void intel_init_thermal(struct cpuinfo_x86 *c)
94 {
95 u32 l, h;
96 int tm2 = 0;
97 unsigned int cpu = smp_processor_id();
99 /* Thermal monitoring */
100 if (!cpu_has(c, X86_FEATURE_ACPI))
101 return; /* -ENODEV */
103 /* Clock modulation */
104 if (!cpu_has(c, X86_FEATURE_ACC))
105 return; /* -ENODEV */
107 /* first check if it's enabled already, in which case there might
108 * be some SMM goo which handles it, so we can't even put a handler
109 * since it might be delivered via SMI already -zwanem.
110 */
111 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
112 h = apic_read(APIC_LVTTHMR);
113 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
114 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
115 return; /* -EBUSY */
116 }
118 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
119 tm2 = 1;
121 /* check whether a vector already exists, temporarily masked? */
122 if (h & APIC_VECTOR_MASK) {
123 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
124 cpu, (h & APIC_VECTOR_MASK));
125 return; /* -EBUSY */
126 }
128 /* The temperature transition interrupt handler setup */
129 h = THERMAL_APIC_VECTOR; /* our delivery vector */
130 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
131 apic_write_around(APIC_LVTTHMR, h);
133 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
134 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
136 /* ok we're good to go... */
137 vendor_thermal_interrupt = intel_thermal_interrupt;
139 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
140 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
142 l = apic_read (APIC_LVTTHMR);
143 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
144 printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
145 cpu, tm2 ? "TM2" : "TM1");
146 return;
147 }
148 #endif /* CONFIG_X86_MCE_THERMAL */
150 static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
151 {
152 if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
153 && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
154 ext->mc_msr[ext->mc_msrs].reg = msr;
155 mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
156 ++ext->mc_msrs;
157 }
158 }
160 static enum mca_extinfo
161 intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
162 {
163 struct mcinfo_extended mc_ext;
165 if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
166 return MCA_EXTINFO_IGNORED;
168 /* This function will be called when CAP(9).MCG_EXT_P = 1 */
169 memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
170 mc_ext.common.type = MC_TYPE_EXTENDED;
171 mc_ext.common.size = sizeof(mc_ext);
173 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EAX);
174 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBX);
175 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ECX);
176 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDX);
177 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESI);
178 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDI);
179 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBP);
180 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESP);
181 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EFLAGS);
182 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EIP);
183 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_MISC);
185 #ifdef __x86_64__
186 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R8);
187 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R9);
188 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R10);
189 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R11);
190 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R12);
191 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R13);
192 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R14);
193 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R15);
194 #endif
196 x86_mcinfo_add(mci, &mc_ext);
198 return MCA_EXTINFO_GLOBAL;
199 }
201 /* This node list records errors impacting a domain. When one
202 * MCE# happens, one error bank impacts a domain. The error node
203 * will be inserted at the tail of the per-domain data for vMCE# MSR
204 * virtualization. When one vMCE# injection has been processed
205 * by the guest, the corresponding node will be deleted.
206 * This node list is for GUEST vMCE# MSR virtualization.
207 */
208 static struct bank_entry* alloc_bank_entry(void) {
209 struct bank_entry *entry;
211 entry = xmalloc(struct bank_entry);
212 if (!entry) {
213 printk(KERN_ERR "MCE: malloc bank_entry failed\n");
214 return NULL;
215 }
216 memset(entry, 0x0, sizeof(*entry)); /* zero the whole struct, not just the pointer */
217 INIT_LIST_HEAD(&entry->list);
218 return entry;
219 }
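/*
 * Each bank_entry carries the MCi_STATUS/MCi_ADDR/MCi_MISC values and the
 * bank number of one logged error (filled in by fill_vmsr_data() below) and
 * is linked onto the owning domain's impact_header list for later vMCE# MSR
 * reads by the guest.
 */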
221 /* Fill error bank info for vMCE# injection and GUEST vMCE#
222 * MSR virtualization data:
223 * 1) Log how many injections (nr_injection) the impacted domain has pending.
224 * 2) Copy the MCE# error bank to the impacted domain's node list,
225 * for vMCE# MSR virtualization.
226 */
228 static int fill_vmsr_data(struct mcinfo_bank *mc_bank,
229 uint64_t gstatus) {
230 struct domain *d;
231 struct bank_entry *entry;
233 /* This error bank impacts one domain; we need to fill domain-related
234 * data for vMCE MSR virtualization and vMCE# injection */
235 if (mc_bank->mc_domid != (uint16_t)~0) {
236 d = get_domain_by_id(mc_bank->mc_domid);
238 /* Does not impact a valid domain; skip this bank's error */
239 if (!d) {
240 mce_printk(MCE_QUIET, "MCE: Not found valid impacted DOM\n");
241 return 0;
242 }
244 /* For an HVM guest, only when the first vMCE has been consumed by the guest
245 * successfully will we generate another node and inject another vMCE.
246 */
247 if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
248 {
249 mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
250 " vMCE yet!\n");
251 return -1;
252 }
253 entry = alloc_bank_entry();
254 if (entry == NULL)
255 return -1;
257 entry->mci_status = mc_bank->mc_status;
258 entry->mci_addr = mc_bank->mc_addr;
259 entry->mci_misc = mc_bank->mc_misc;
260 entry->bank = mc_bank->mc_bank;
262 spin_lock(&d->arch.vmca_msrs.lock);
263 /* New error node; insert it at the tail of the per_dom data */
264 list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
265 /* Fill MSR global status */
266 d->arch.vmca_msrs.mcg_status = gstatus;
267 /* The new node impacts the domain; another vMCE# injection is needed */
268 d->arch.vmca_msrs.nr_injection++;
269 spin_unlock(&d->arch.vmca_msrs.lock);
271 mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
272 "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
273 mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
274 mc_bank->mc_domid);
275 }
276 return 0;
277 }
279 static int inject_mce(struct domain *d)
280 {
281 int cpu = smp_processor_id();
282 cpumask_t affinity;
284 /* PV guests and HVM guests have different vMCE# injection
285 * methods */
287 if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
288 {
289 if (d->is_hvm)
290 {
291 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
292 d->domain_id);
293 vcpu_kick(d->vcpu[0]);
294 }
295 /* PV guest including DOM0 */
296 else
297 {
298 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
299 d->domain_id);
300 if (guest_has_trap_callback
301 (d, 0, TRAP_machine_check))
302 {
303 d->vcpu[0]->cpu_affinity_tmp =
304 d->vcpu[0]->cpu_affinity;
305 cpus_clear(affinity);
306 cpu_set(cpu, affinity);
307 mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", cpu,
308 d->vcpu[0]->processor);
309 vcpu_set_affinity(d->vcpu[0], &affinity);
310 vcpu_kick(d->vcpu[0]);
311 }
312 else
313 {
314 mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE handler\n");
315 domain_crash(d);
316 }
317 }
318 }
319 else {
320 /* A new vMCE arrived while the first one has not been injected yet;
321 * in this case, the injection fails. [We can't lose this vMCE, for
322 * the sake of the MCE node's consistency.]
323 */
324 mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
325 " to this DOM%d!\n", d->domain_id);
326 return -1;
327 }
328 return 0;
329 }
331 static void intel_UCR_handler(struct mcinfo_bank *bank,
332 struct mcinfo_global *global,
333 struct mcinfo_extended *extension,
334 struct mca_handle_result *result)
335 {
336 struct domain *d;
337 unsigned long mfn, gfn;
338 uint32_t status;
340 mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
341 result->result = MCA_NEED_RESET;
342 if (bank->mc_addr != 0) {
343 mfn = bank->mc_addr >> PAGE_SHIFT;
344 if (!offline_page(mfn, 1, &status)) {
345 /* This is a free page */
346 if (status & PG_OFFLINE_OFFLINED)
347 result->result = MCA_RECOVERED;
348 else if (status & PG_OFFLINE_PENDING) {
349 /* This page has an owner */
350 if (status & PG_OFFLINE_OWNED) {
351 result->result |= MCA_OWNER;
352 result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
353 mce_printk(MCE_QUIET, "MCE: This error page is ownded"
354 " by DOM %d\n", result->owner);
355 /* Fill vMCE# injection and vMCE# MSR virtualization "
356 * "related data */
357 bank->mc_domid = result->owner;
358 /* XXX: Cannot handle shared pages yet
359 * (this should identify all domains and gfn mapping to
360 * the mfn in question) */
361 BUG_ON( result->owner == DOMID_COW );
362 if ( result->owner != DOMID_XEN ) {
364 d = get_domain_by_id(result->owner);
365 if ( mca_ctl_conflict(bank, d) )
366 {
367 /* The guest has a different MCE ctl from the hypervisor */
368 put_domain(d);
369 return;
370 }
372 gfn =
373 mfn_to_gmfn(d, ((bank->mc_addr) >> PAGE_SHIFT));
374 bank->mc_addr = gfn << PAGE_SHIFT |
375 (bank->mc_addr & (PAGE_SIZE -1 ));
376 if (fill_vmsr_data(bank, global->mc_gstatus) == -1)
377 {
378 mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
379 "failed\n", result->owner);
380 put_domain(d);
381 domain_crash(d);
382 return;
383 }
384 /* We will inject a vMCE to the DomU */
385 if ( inject_mce(d) < 0 )
386 {
387 mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
388 " failed\n", d->domain_id);
389 put_domain(d);
390 domain_crash(d);
391 return;
392 }
393 /* The impacted domain goes on with its own recovery job
394 * if it has its own MCA handler.
395 * For Xen, the error has been contained and Xen has finished
396 * its own recovery job.
397 */
398 result->result = MCA_RECOVERED;
399 put_domain(d);
400 }
401 }
402 }
403 }
404 }
405 }
407 #define INTEL_MAX_RECOVERY 2
408 struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
409 {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
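/*
 * The two composite MCA error codes in the table above are expected to be
 * the SRAO signatures from the Intel SDM: 0x00C0 is the memory-scrubbing
 * error code and 0x017A the L3 explicit-writeback code. mce_action() below
 * matches the low bits of MCi_STATUS against these entries.
 */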
411 /*
412 * Called from mctelem_process_deferred. Return 1 if the telemetry
413 * should be committed for dom0 consumption, 0 if it should be
414 * dismissed.
415 */
416 static int mce_action(mctelem_cookie_t mctc)
417 {
418 struct mc_info *local_mi;
419 uint32_t i;
420 struct mcinfo_common *mic = NULL;
421 struct mcinfo_global *mc_global;
422 struct mcinfo_bank *mc_bank;
423 struct mca_handle_result mca_res;
425 local_mi = (struct mc_info*)mctelem_dataptr(mctc);
426 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
427 if (mic == NULL) {
428 printk(KERN_ERR "MCE: get local buffer entry failed\n ");
429 return 0;
430 }
432 mc_global = (struct mcinfo_global *)mic;
434 /* Processing bank information */
435 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
437 for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
438 if (mic->type != MC_TYPE_BANK) {
439 continue;
440 }
441 mc_bank = (struct mcinfo_bank*)mic;
443 /* TODO: Add recovery actions here, such as page-offline, etc */
444 memset(&mca_res, 0x0f, sizeof(mca_res));
445 for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
446 if ( ((mc_bank->mc_status & 0xffff) ==
447 intel_recovery_handler[i].mca_code) ||
448 ((mc_bank->mc_status & 0xfff0) ==
449 intel_recovery_handler[i].mca_code)) {
450 /* For SRAR, OVER = 1 should have caused a reset.
451 * For SRAO, OVER = 1: skip the recovery action and continue execution.
452 */
453 if (!(mc_bank->mc_status & MCi_STATUS_OVER))
454 intel_recovery_handler[i].recovery_handler
455 (mc_bank, mc_global, NULL, &mca_res);
456 else {
457 if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
458 mca_res.result = MCA_NEED_RESET;
459 else
460 mca_res.result = MCA_NO_ACTION;
461 }
462 if (mca_res.result & MCA_OWNER)
463 mc_bank->mc_domid = mca_res.owner;
464 if (mca_res.result == MCA_NEED_RESET)
465 /* DOMID_XEN*/
466 mc_panic("MCE: Software recovery failed for the UCR "
467 "error\n");
468 else if (mca_res.result == MCA_RECOVERED)
469 mce_printk(MCE_VERBOSE, "MCE: The UCR error is "
470 "successfully recovered by software!\n");
471 else if (mca_res.result == MCA_NO_ACTION)
472 mce_printk(MCE_VERBOSE, "MCE: Overwritten SRAO error, can't "
473 "do recovery action; RIPV=1, let it be.\n");
474 break;
475 }
476 }
477 /* For an SRAR error with no defined recovery action, a reset should
478 * already have been caused in the MCA handler.
479 */
480 if ( i >= INTEL_MAX_RECOVERY )
481 mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
482 " found for this SRAO error\n");
484 }
485 return 1;
486 }
488 /* Softirq Handler for this MCE# processing */
489 static void mce_softirq(void)
490 {
491 int cpu = smp_processor_id();
492 unsigned int workcpu;
494 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
496 mce_barrier_enter(&mce_inside_bar);
498 /*
499 * Everybody is here. Now let's see who gets to do the
500 * recovery work. Right now we just see if there's a CPU
501 * that did not have any problems, and pick that one.
502 *
503 * First, just set a default value: the last CPU who reaches this
504 * will overwrite the value and become the default.
505 */
507 atomic_set(&severity_cpu, cpu);
509 mce_barrier_enter(&mce_severity_bar);
510 if (!mctelem_has_deferred(cpu))
511 atomic_set(&severity_cpu, cpu);
512 mce_barrier_exit(&mce_severity_bar);
514 /* We choose severity_cpu for further processing */
515 if (atomic_read(&severity_cpu) == cpu) {
517 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
519 /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
520 * vMCE MSRs virtualization buffer
521 */
522 for_each_online_cpu(workcpu) {
523 mctelem_process_deferred(workcpu, mce_action);
524 }
526 /* Step2: Send Log to DOM0 through vIRQ */
527 if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
528 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
529 send_guest_global_virq(dom0, VIRQ_MCA);
530 }
531 }
533 mce_barrier_exit(&mce_inside_bar);
534 }
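/*
 * Note on the severity_cpu election above: every CPU first writes its own id,
 * then only CPUs with no deferred telemetry overwrite it inside the severity
 * barrier, so a CPU that recorded no error is preferred for doing the
 * processing and the dom0 notification.
 */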
536 /* Machine check owner judging algorithm:
537 * When an error happens, all CPUs serially read their MSR banks.
538 * The first CPU that fetches the error bank's info will clear
539 * this bank. Later readers can't get any info from it again.
540 * The first CPU is the actual mce_owner.
541 *
542 * A fatal (pcc=1) error might cause a machine crash
543 * before we're able to log it. To avoid losing the log, we adopt a two-
544 * round scan:
545 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
546 * All MCE banks are sticky; at boot, the MCE polling mechanism
547 * will help to collect and log those MCE errors.
548 * Round 2: do all the MCE processing logic as normal.
549 */
551 static void mce_panic_check(void)
552 {
553 if (is_mc_panic) {
554 local_irq_enable();
555 for ( ; ; )
556 halt();
557 }
558 }
560 /*
561 * Initialize a barrier. Just set it to 0.
562 */
563 static void mce_barrier_init(struct mce_softirq_barrier *bar)
564 {
565 atomic_set(&bar->val, 0);
566 atomic_set(&bar->ingen, 0);
567 atomic_set(&bar->outgen, 0);
568 }
570 #if 0
571 /*
572 * This function will need to be used when offlining a CPU in the
573 * recovery actions.
574 *
575 * Decrement a barrier only. Needed for cases where the CPU
576 * in question can't do it itself (e.g. it is being offlined).
577 */
578 static void mce_barrier_dec(struct mce_softirq_barrier *bar)
579 {
580 atomic_inc(&bar->outgen);
581 wmb();
582 atomic_dec(&bar->val);
583 }
584 #endif
586 static void mce_spin_lock(spinlock_t *lk)
587 {
588 while (!spin_trylock(lk)) {
589 cpu_relax();
590 mce_panic_check();
591 }
592 }
594 static void mce_spin_unlock(spinlock_t *lk)
595 {
596 spin_unlock(lk);
597 }
599 /*
600 * Increment the generation number and the value. The generation number
601 * is incremented when entering a barrier. This way, it can be checked
602 * on exit if a CPU is trying to re-enter the barrier. This can happen
603 * if the first CPU to make it out immediately exits or re-enters, while
604 * another CPU that is still in the loop becomes otherwise occupied
605 * (e.g. it needs to service an interrupt, etc), missing the value
606 * it's waiting for.
607 *
608 * These barrier functions should always be paired, so that the
609 * counter value will reach 0 again after all CPUs have exited.
610 */
611 static void mce_barrier_enter(struct mce_softirq_barrier *bar)
612 {
613 int gen;
615 if (!mce_broadcast)
616 return;
617 atomic_inc(&bar->ingen);
618 gen = atomic_read(&bar->outgen);
619 mb();
620 atomic_inc(&bar->val);
621 while ( atomic_read(&bar->val) != num_online_cpus() &&
622 atomic_read(&bar->outgen) == gen) {
623 mb();
624 mce_panic_check();
625 }
626 }
628 static void mce_barrier_exit(struct mce_softirq_barrier *bar)
629 {
630 int gen;
632 if (!mce_broadcast)
633 return;
634 atomic_inc(&bar->outgen);
635 gen = atomic_read(&bar->ingen);
636 mb();
637 atomic_dec(&bar->val);
638 while ( atomic_read(&bar->val) != 0 &&
639 atomic_read(&bar->ingen) == gen ) {
640 mb();
641 mce_panic_check();
642 }
643 }
645 #if 0
646 static void mce_barrier(struct mce_softirq_barrier *bar)
647 {
648 mce_barrier_enter(bar);
649 mce_barrier_exit(bar);
650 }
651 #endif
653 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
654 {
655 uint64_t gstatus;
656 mctelem_cookie_t mctc = NULL;
657 struct mca_summary bs;
658 cpu_banks_t clear_bank;
660 mce_spin_lock(&mce_logout_lock);
662 memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
663 mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
665 if (bs.errcnt) {
666 /*
667 * Uncorrected errors must be dealt with in softirq context.
668 */
669 if (bs.uc || bs.pcc) {
670 add_taint(TAINT_MACHINE_CHECK);
671 if (mctc != NULL)
672 mctelem_defer(mctc);
673 /*
674 * For PCC=1, the error can't be recovered and context is lost, so reboot
675 * now without clearing the banks, and deal with the telemetry after reboot
676 * (the MSRs are sticky).
677 */
678 if (bs.pcc)
679 mc_panic("State lost due to machine check exception.\n");
680 if (!bs.ripv)
681 mc_panic("RIPV =0 can't resume execution!\n");
682 if (!bs.recoverable)
683 mc_panic("Machine check exception software recovery fail.\n");
684 } else {
685 if (mctc != NULL)
686 mctelem_commit(mctc);
687 }
688 atomic_set(&found_error, 1);
690 mce_printk(MCE_VERBOSE, "MCE: clear_bank map %lx on CPU%d\n",
691 *((unsigned long*)clear_bank), smp_processor_id());
692 mcheck_mca_clearbanks(clear_bank);
693 /* Print MCE error */
694 x86_mcinfo_dump(mctelem_dataptr(mctc));
696 } else {
697 if (mctc != NULL)
698 mctelem_dismiss(mctc);
699 }
700 mce_spin_unlock(&mce_logout_lock);
702 /*
703 * Wait until everybody has processed the trap.
704 */
705 mce_barrier_enter(&mce_trap_bar);
706 /* According to the latest MCA OS writer's guide, if no error bank is found
707 * on any CPU, something unexpected is happening; we can't do any
708 * recovery job but reset the system.
709 */
710 if (atomic_read(&found_error) == 0)
711 mc_panic("Unexpected condition for the MCE handler, need reset\n");
712 mce_barrier_exit(&mce_trap_bar);
714 /* Clear the error-found flag after all CPUs finish the above judgement */
715 mce_barrier_enter(&mce_trap_bar);
716 if (atomic_read(&found_error)) {
717 mce_printk(MCE_VERBOSE, "MCE: Choose one CPU "
718 "to clear error finding flag\n ");
719 atomic_set(&found_error, 0);
720 }
721 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
722 if ((gstatus & MCG_STATUS_MCIP) != 0) {
723 mce_printk(MCE_VERBOSE, "MCE: Clear MCIP@ last step");
724 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
725 }
726 mce_barrier_exit(&mce_trap_bar);
728 raise_softirq(MACHINE_CHECK_SOFTIRQ);
729 }
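/*
 * Severity abbreviations used in the rules below (Intel SDM terminology):
 * SRAR = software recoverable action required, SRAO = software recoverable
 * action optional, UCNA = uncorrected no action required, CE = corrected
 * error; ser_support reflects the MCG_SER_P capability bit.
 */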
731 /* According to the MCA OS writer's guide, the CMCI handler needs to clear the bank when:
732 * 1) CE (UC = 0)
733 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
734 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
735 * The MCA handler needs to clear the bank when:
736 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
737 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
738 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
739 */
741 static int intel_need_clearbank_scan(enum mca_source who, u64 status)
742 {
743 if ( who == MCA_CMCI_HANDLER) {
744 /* CE: CMCI needs to clear the bank */
745 if ( !(status & MCi_STATUS_UC) )
746 return 1;
747 /* A spurious error needs the bank cleared */
748 else if ( ser_support && !(status & MCi_STATUS_OVER)
749 && !(status & MCi_STATUS_EN) )
750 return 1;
751 /* UCNA with OVER = 0 needs the bank cleared */
752 else if ( ser_support && !(status & MCi_STATUS_OVER)
753 && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
754 && !(status & MCi_STATUS_AR))
755 return 1;
756 /* Only Log, no clear */
757 else return 0;
758 }
759 else if ( who == MCA_MCE_SCAN) {
760 /* A spurious error needs the bank cleared */
761 if ( ser_support && !(status & MCi_STATUS_OVER)
762 && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
763 return 1;
764 /* SRAR with OVER = 0: clear the bank. OVER = 1 should have caused a reset */
765 else if ( ser_support && (status & MCi_STATUS_UC)
766 && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
767 && (status & MCi_STATUS_OVER) )
768 return 1;
769 /* SRAO needs the bank cleared */
770 else if ( ser_support && !(status & MCi_STATUS_AR)
771 && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
772 return 1;
773 else
774 return 0;
775 }
777 return 1;
778 }
780 /* MCE continues/is recoverable when:
781 * 1) CE UC = 0
782 * 2) Spurious ser_support = 1, OVER = 0, EN = 0 [UC = 1]
783 * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
784 * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
785 * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0 [UC = 1]
786 */
787 static int intel_recoverable_scan(u64 status)
788 {
790 if ( !(status & MCi_STATUS_UC ) )
791 return 1;
792 else if ( ser_support && !(status & MCi_STATUS_EN)
793 && !(status & MCi_STATUS_OVER) )
794 return 1;
795 /* SRAR error */
796 else if ( ser_support && !(status & MCi_STATUS_OVER)
797 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
798 && (status & MCi_STATUS_AR) ) {
799 mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
800 return 0;
801 }
802 /* SRAO error */
803 else if (ser_support && !(status & MCi_STATUS_PCC)
804 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
805 && (status & MCi_STATUS_EN))
806 return 1;
807 /* UCNA error */
808 else if (ser_support && !(status & MCi_STATUS_OVER)
809 && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
810 && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
811 return 1;
812 return 0;
813 }
815 static DEFINE_SPINLOCK(cmci_discover_lock);
817 /*
818 * Discover bank sharing using the algorithm recommended in the SDM.
819 */
820 static int do_cmci_discover(int i)
821 {
822 unsigned msr = MSR_IA32_MC0_CTL2 + i;
823 u64 val;
825 rdmsrl(msr, val);
826 /* Some other CPU already owns this bank. */
827 if (val & CMCI_EN) {
828 clear_bit(i, __get_cpu_var(mce_banks_owned));
829 goto out;
830 }
831 wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
832 rdmsrl(msr, val);
834 if (!(val & CMCI_EN)) {
835 /* This bank does not support CMCI. Polling timer has to handle it. */
836 set_bit(i, __get_cpu_var(no_cmci_banks));
837 return 0;
838 }
839 set_bit(i, __get_cpu_var(mce_banks_owned));
840 out:
841 clear_bit(i, __get_cpu_var(no_cmci_banks));
842 return 1;
843 }
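/*
 * Ownership relies on the CMCI_EN bit in MCi_CTL2 being shared by all
 * threads/cores that share a bank: the first CPU to set it becomes the
 * owner, a later CPU that finds CMCI_EN already set backs off, and a bank
 * where the bit does not stick is left to the polling timer, per the SDM
 * algorithm referenced above.
 */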
845 static void cmci_discover(void)
846 {
847 unsigned long flags;
848 int i;
849 mctelem_cookie_t mctc;
850 struct mca_summary bs;
852 mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
854 spin_lock_irqsave(&cmci_discover_lock, flags);
856 for (i = 0; i < nr_mce_banks; i++)
857 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
858 do_cmci_discover(i);
860 spin_unlock_irqrestore(&cmci_discover_lock, flags);
862 /* In case a CMCI happened during the owner change:
863 * if a CMCI happened but was not processed immediately,
864 * MCi_STATUS (error_count, bits 38~52) is not cleared and
865 * the CMCI interrupt will never be triggered again.
866 */
868 mctc = mcheck_mca_logout(
869 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
871 if (bs.errcnt && mctc != NULL) {
872 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
873 mctelem_commit(mctc);
874 send_guest_global_virq(dom0, VIRQ_MCA);
875 } else {
876 x86_mcinfo_dump(mctelem_dataptr(mctc));
877 mctelem_dismiss(mctc);
878 }
879 } else if (mctc != NULL)
880 mctelem_dismiss(mctc);
882 mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
883 smp_processor_id(),
884 *((unsigned long *)__get_cpu_var(mce_banks_owned)),
885 *((unsigned long *)__get_cpu_var(no_cmci_banks)));
886 }
888 /*
889 * Define an owner for each bank. Banks can be shared between CPUs
890 * and to avoid reporting events multiple times always set up one
891 * CPU as owner.
892 *
893 * The assignment has to be redone when CPUs go offline and
894 * any of the owners goes away. Also pollers run in parallel so we
895 * have to be careful to update the banks in a way that doesn't
896 * lose or duplicate events.
897 */
899 static void mce_set_owner(void)
900 {
901 if (!cmci_support || mce_disabled == 1)
902 return;
904 cmci_discover();
905 }
907 static void __cpu_mcheck_distribute_cmci(void *unused)
908 {
909 cmci_discover();
910 }
912 void cpu_mcheck_distribute_cmci(void)
913 {
914 if (cmci_support && !mce_disabled)
915 on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
916 }
918 static void clear_cmci(void)
919 {
920 int i;
922 if (!cmci_support || mce_disabled == 1)
923 return;
925 mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
926 smp_processor_id());
928 for (i = 0; i < nr_mce_banks; i++) {
929 unsigned msr = MSR_IA32_MC0_CTL2 + i;
930 u64 val;
931 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
932 continue;
933 rdmsrl(msr, val);
934 if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
935 wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
936 clear_bit(i, __get_cpu_var(mce_banks_owned));
937 }
938 }
940 void cpu_mcheck_disable(void)
941 {
942 clear_in_cr4(X86_CR4_MCE);
944 if (cmci_support && !mce_disabled)
945 clear_cmci();
946 }
948 static void intel_init_cmci(struct cpuinfo_x86 *c)
949 {
950 u32 l, apic;
951 int cpu = smp_processor_id();
953 if (!mce_available(c) || !cmci_support) {
954 mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
955 return;
956 }
958 apic = apic_read(APIC_CMCI);
959 if ( apic & APIC_VECTOR_MASK )
960 {
961 mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
962 cpu, ( apic & APIC_VECTOR_MASK ));
963 return;
964 }
966 apic = CMCI_APIC_VECTOR;
967 apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
968 apic_write_around(APIC_CMCI, apic);
970 l = apic_read(APIC_CMCI);
971 apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
972 }
974 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
975 {
976 mctelem_cookie_t mctc;
977 struct mca_summary bs;
978 struct cpu_user_regs *old_regs = set_irq_regs(regs);
980 ack_APIC_irq();
981 irq_enter();
983 mctc = mcheck_mca_logout(
984 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
986 if (bs.errcnt && mctc != NULL) {
987 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
988 mctelem_commit(mctc);
989 mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
990 send_guest_global_virq(dom0, VIRQ_MCA);
991 } else {
992 x86_mcinfo_dump(mctelem_dataptr(mctc));
993 mctelem_dismiss(mctc);
994 }
995 } else if (mctc != NULL)
996 mctelem_dismiss(mctc);
998 irq_exit();
999 set_irq_regs(old_regs);
1000 }
1002 void mce_intel_feature_init(struct cpuinfo_x86 *c)
1003 {
1005 #ifdef CONFIG_X86_MCE_THERMAL
1006 intel_init_thermal(c);
1007 #endif
1008 intel_init_cmci(c);
1009 }
1011 static void _mce_cap_init(struct cpuinfo_x86 *c)
1012 {
1013 u32 l = mce_cap_init();
1015 if ((l & MCG_CMCI_P) && cpu_has_apic)
1016 cmci_support = 1;
1018 /* Support Software Error Recovery */
1019 if (l & MCG_SER_P)
1020 ser_support = 1;
1022 if (l & MCG_EXT_P)
1023 {
1024 nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
1025 mce_printk (MCE_QUIET, "CPU%d: Intel Extended MCE MSRs (%d) available\n",
1026 smp_processor_id(), nr_intel_ext_msrs);
1027 }
1028 firstbank = mce_firstbank(c);
1029 }
1031 static void mce_init(void)
1032 {
1033 u32 l, h;
1034 int i;
1035 mctelem_cookie_t mctc;
1036 struct mca_summary bs;
1038 clear_in_cr4(X86_CR4_MCE);
1040 mce_barrier_init(&mce_inside_bar);
1041 mce_barrier_init(&mce_severity_bar);
1042 mce_barrier_init(&mce_trap_bar);
1043 spin_lock_init(&mce_logout_lock);
1045 /* Log the machine checks left over from the previous reset.
1046 * This also clears all registers. */
1048 mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
1050 /* In the boot-up stage, print out and also log in the DOM0 boot process */
1051 if (bs.errcnt && mctc != NULL) {
1052 x86_mcinfo_dump(mctelem_dataptr(mctc));
1053 mctelem_commit(mctc);
1054 }
1056 set_in_cr4(X86_CR4_MCE);
1058 for (i = firstbank; i < nr_mce_banks; i++)
1059 {
1060 /* Some banks are shared across cores; use MCi_CTL to judge whether
1061 * this bank has been initialized by other cores already. */
1062 rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
1063 if (!(l | h))
1064 {
1065 /* If ctl is 0, this bank was never initialized */
1066 mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
1067 wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
1068 wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
1069 }
1070 }
1071 if (firstbank) /* if CMCI is enabled, firstbank = 0 */
1072 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
1073 }
1075 /* P4/P6 families have a similar MCA initialization process */
1076 int intel_mcheck_init(struct cpuinfo_x86 *c)
1077 {
1078 _mce_cap_init(c);
1079 mce_printk(MCE_QUIET, "Intel machine check reporting enabled on CPU#%d.\n",
1080 smp_processor_id());
1082 /* machine check is available */
1083 x86_mce_vector_register(intel_machine_check);
1084 x86_mce_callback_register(intel_get_extended_msrs);
1085 mce_recoverable_register(intel_recoverable_scan);
1086 mce_need_clearbank_register(intel_need_clearbank_scan);
1088 mce_init();
1089 mce_intel_feature_init(c);
1090 mce_set_owner();
1092 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1093 return 1;
1094 }
1096 int intel_mce_wrmsr(uint32_t msr, uint64_t val)
1097 {
1098 int ret = 1;
1100 switch ( msr )
1101 {
1102 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1103 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1104 "Guest should not write this MSR!\n");
1105 break;
1106 default:
1107 ret = 0;
1108 break;
1109 }
1111 return ret;
1112 }
1114 int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
1115 {
1116 int ret = 1;
1118 switch ( msr )
1119 {
1120 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1121 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1122 "Guest should not read this MSR!\n");
1123 break;
1124 default:
1125 ret = 0;
1126 break;
1127 }
1129 return ret;
1130 }