debuggers.hg

view xen/arch/x86/cpu/mcheck/mce_intel.c @ 22906:700ac6445812

Now add KDB to the non-kdb tree
author Mukesh Rathor
date Thu Feb 03 15:42:41 2011 -0800 (2011-02-03)
parents e8acb9753ff1
1 #include <xen/init.h>
2 #include <xen/types.h>
3 #include <xen/irq.h>
4 #include <xen/event.h>
5 #include <xen/kernel.h>
6 #include <xen/delay.h>
7 #include <xen/smp.h>
8 #include <xen/mm.h>
9 #include <xen/cpu.h>
10 #include <asm/processor.h>
11 #include <public/sysctl.h>
12 #include <asm/system.h>
13 #include <asm/msr.h>
14 #include <asm/p2m.h>
15 #include <asm/mce.h>
16 #include <asm/apic.h>
17 #include "mce.h"
18 #include "x86_mca.h"
20 DEFINE_PER_CPU(struct mca_banks *, mce_banks_owned);
21 DEFINE_PER_CPU(struct mca_banks *, no_cmci_banks);
22 DEFINE_PER_CPU(struct mca_banks *, mce_clear_banks);
23 bool_t __read_mostly cmci_support = 0;
24 static bool_t __read_mostly ser_support = 0;
25 static bool_t __read_mostly mce_force_broadcast;
26 boolean_param("mce_fb", mce_force_broadcast);
28 static int nr_intel_ext_msrs = 0;
30 /* Thermal Handling */
31 #ifdef CONFIG_X86_MCE_THERMAL
32 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
33 {
34 printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
35 smp_processor_id());
36 add_taint(TAINT_MACHINE_CHECK);
37 }
39 /* P4/Xeon Thermal transition interrupt handler */
40 static void intel_thermal_interrupt(struct cpu_user_regs *regs)
41 {
42 uint64_t msr_content;
43 unsigned int cpu = smp_processor_id();
44 static DEFINE_PER_CPU(s_time_t, next);
46 ack_APIC_irq();
47 if (NOW() < per_cpu(next, cpu))
48 return;
50 per_cpu(next, cpu) = NOW() + MILLISECS(5000);
51 rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
52 if (msr_content & 0x1) {
53 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
54 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
55 cpu);
56 add_taint(TAINT_MACHINE_CHECK);
57 } else {
58 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
59 }
60 }
62 /* Thermal interrupt handler for this CPU setup */
63 static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs)
64 = unexpected_thermal_interrupt;
66 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
67 {
68 struct cpu_user_regs *old_regs = set_irq_regs(regs);
69 irq_enter();
70 vendor_thermal_interrupt(regs);
71 irq_exit();
72 set_irq_regs(old_regs);
73 }
75 /* P4/Xeon Thermal regulation detect and init */
76 static void intel_init_thermal(struct cpuinfo_x86 *c)
77 {
78 uint64_t msr_content;
79 uint32_t val;
80 int tm2 = 0;
81 unsigned int cpu = smp_processor_id();
83 /* Thermal monitoring */
84 if (!cpu_has(c, X86_FEATURE_ACPI))
85 return; /* -ENODEV */
87 /* Clock modulation */
88 if (!cpu_has(c, X86_FEATURE_ACC))
89 return; /* -ENODEV */
91 /* First check if it's enabled already, in which case there might
92 * be some SMM goo which handles it, so we can't even install a handler
93 * since it might already be delivered via SMI. -zwanem
94 */
95 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
96 val = apic_read(APIC_LVTTHMR);
97 if ((msr_content & (1ULL<<3)) && (val & APIC_DM_SMI)) {
98 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
99 return; /* -EBUSY */
100 }
102 if (cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)))
103 tm2 = 1;
105 /* check whether a vector already exists, temporarily masked? */
106 if (val & APIC_VECTOR_MASK) {
107 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
108 cpu, (val & APIC_VECTOR_MASK));
109 return; /* -EBUSY */
110 }
112 /* The temperature transition interrupt handler setup */
113 val = THERMAL_APIC_VECTOR; /* our delivery vector */
114 val |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
115 apic_write_around(APIC_LVTTHMR, val);
117 rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
118 wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);
120 /* ok we're good to go... */
121 vendor_thermal_interrupt = intel_thermal_interrupt;
123 rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
124 wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3));
126 apic_write_around(APIC_LVTTHMR, apic_read(APIC_LVTTHMR) & ~APIC_LVT_MASKED);
127 if (opt_cpu_info)
128 printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
129 cpu, tm2 ? "TM2" : "TM1");
130 return;
131 }
132 #endif /* CONFIG_X86_MCE_THERMAL */
134 /* MCE handling */
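/*
 * Barrier used to keep the CPUs in lock-step while handling a broadcast
 * MCE: 'val' counts the CPUs currently inside the barrier, while 'ingen'
 * and 'outgen' are generation counters bumped on entry and exit so that a
 * CPU spinning in mce_barrier_enter()/mce_barrier_exit() can tell when the
 * others have already moved on (see the comment above those functions).
 */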
135 struct mce_softirq_barrier {
136 atomic_t val;
137 atomic_t ingen;
138 atomic_t outgen;
139 };
141 static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
142 static struct mce_softirq_barrier mce_trap_bar;
144 /*
145 * mce_logout_lock should only be used in the trap handler,
146 * while MCIP has not been cleared yet in the global status
147 * register. Other use is not safe, since an MCE trap can
148 * happen at any moment, which would cause lock recursion.
149 */
150 static DEFINE_SPINLOCK(mce_logout_lock);
152 static atomic_t severity_cpu = ATOMIC_INIT(-1);
153 static atomic_t found_error = ATOMIC_INIT(0);
154 static cpumask_t mce_fatal_cpus;
156 static void mce_barrier_enter(struct mce_softirq_barrier *);
157 static void mce_barrier_exit(struct mce_softirq_barrier *);
159 struct mca_error_handler *mce_dhandlers, *mce_uhandlers;
160 int mce_dhandler_num, mce_uhandler_num;
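/*
 * Overall result of recovery processing, in increasing order of severity:
 * mce_action() below keeps the numerically largest (i.e. worst) result seen
 * while walking the banks, and stops early once MCER_RESET is reached.
 */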
162 enum mce_result
163 {
164 MCER_NOERROR,
165 MCER_RECOVERED,
166 /* Not recovered, but can continue */
167 MCER_CONTINUE,
168 MCER_RESET,
169 };
171 /* May be called in MCE context: no locks, no printk */
172 static enum mce_result mce_action(struct cpu_user_regs *regs,
173 mctelem_cookie_t mctc)
174 {
175 struct mc_info *local_mi;
176 enum mce_result ret = MCER_NOERROR;
177 uint32_t i;
178 struct mcinfo_common *mic = NULL;
179 struct mca_handle_result mca_res;
180 struct mca_binfo binfo;
181 struct mca_error_handler *handlers = mce_dhandlers;
182 int handler_num = mce_dhandler_num;
184 /* When in mce context, regs is valid */
185 if (regs)
186 {
187 handler_num = mce_uhandler_num;
188 handlers = mce_uhandlers;
189 }
191 /* At least a default handler should be registered */
192 ASSERT(handler_num);
194 local_mi = (struct mc_info*)mctelem_dataptr(mctc);
195 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
196 if (mic == NULL) {
197 printk(KERN_ERR "MCE: get local buffer entry failed\n ");
198 return MCER_CONTINUE;
199 }
201 memset(&binfo, 0, sizeof(binfo));
202 binfo.mig = (struct mcinfo_global *)mic;
203 binfo.mi = local_mi;
205 /* Processing bank information */
206 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
208 for ( ; ret != MCER_RESET && mic && mic->size;
209 mic = x86_mcinfo_next(mic) )
210 {
211 if (mic->type != MC_TYPE_BANK) {
212 continue;
213 }
214 binfo.mib = (struct mcinfo_bank*)mic;
215 binfo.bank = binfo.mib->mc_bank;
216 memset(&mca_res, 0x0f, sizeof(mca_res));
217 for ( i = 0; i < handler_num; i++ ) {
218 if (handlers[i].owned_error(binfo.mib->mc_status))
219 {
220 handlers[i].recovery_handler(binfo.bank, &binfo, &mca_res);
222 if (mca_res.result & MCA_OWNER)
223 binfo.mib->mc_domid = mca_res.owner;
225 if (mca_res.result == MCA_NEED_RESET)
226 ret = MCER_RESET;
227 else if (mca_res.result == MCA_RECOVERED)
228 {
229 if (ret < MCER_RECOVERED)
230 ret = MCER_RECOVERED;
231 }
232 else if (mca_res.result == MCA_NO_ACTION)
233 {
234 if (ret < MCER_CONTINUE)
235 ret = MCER_CONTINUE;
236 }
237 break;
238 }
239 }
240 ASSERT(i != handler_num);
241 }
243 return ret;
244 }
246 /*
247 * Called from mctelem_process_deferred. Return 1 if the telemetry
248 * should be committed for dom0 consumption, 0 if it should be
249 * dismissed.
250 */
251 static int mce_delayed_action(mctelem_cookie_t mctc)
252 {
253 enum mce_result result;
254 int ret = 0;
256 result = mce_action(NULL, mctc);
258 switch (result)
259 {
260 case MCER_RESET:
261 dprintk(XENLOG_ERR, "MCE delayed action failed\n");
262 x86_mcinfo_dump(mctelem_dataptr(mctc));
263 panic("MCE: Software recovery failed for the UCR\n");
264 break;
265 case MCER_RECOVERED:
266 dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
267 ret = 1;
268 break;
269 case MCER_CONTINUE:
270 dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
271 "system is tainted\n");
272 x86_mcinfo_dump(mctelem_dataptr(mctc));
273 ret = 1;
274 break;
275 default:
276 ret = 0;
277 break;
278 }
279 return ret;
280 }
282 /* Softirq Handler for this MCE# processing */
283 static void mce_softirq(void)
284 {
285 int cpu = smp_processor_id();
286 unsigned int workcpu;
288 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
290 mce_barrier_enter(&mce_inside_bar);
292 /*
293 * Everybody is here. Now let's see who gets to do the
294 * recovery work. Right now we just see if there's a CPU
295 * that did not have any problems, and pick that one.
296 *
297 * First, just set a default value: the last CPU who reaches this
298 * will overwrite the value and become the default.
299 */
301 atomic_set(&severity_cpu, cpu);
303 mce_barrier_enter(&mce_severity_bar);
304 if (!mctelem_has_deferred(cpu))
305 atomic_set(&severity_cpu, cpu);
306 mce_barrier_exit(&mce_severity_bar);
308 /* We choose severity_cpu for further processing */
309 if (atomic_read(&severity_cpu) == cpu) {
311 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
313 /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
314 * vMCE MSRs virtualization buffer
315 */
316 for_each_online_cpu(workcpu) {
317 mctelem_process_deferred(workcpu, mce_delayed_action);
318 }
320 /* Step2: Send Log to DOM0 through vIRQ */
321 if (dom0_vmce_enabled()) {
322 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
323 send_guest_global_virq(dom0, VIRQ_MCA);
324 }
325 }
327 mce_barrier_exit(&mce_inside_bar);
328 }
330 /*
331 * Return:
332 * -1: if the system can't be recovered
333 * 0: continue to the next step
334 */
335 static int mce_urgent_action(struct cpu_user_regs *regs,
336 mctelem_cookie_t mctc)
337 {
338 uint64_t gstatus;
340 if ( mctc == NULL)
341 return 0;
343 gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
344 /* Xen is not pre-emptible */
345 if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
346 return 0;
348 return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
349 }
351 /* Machine Check owner judge algorithm:
352 * When an error happens, all CPUs serially read their MSR banks.
353 * The first CPU that fetches the error bank's info will clear
354 * this bank. Later readers can't get any info from it again.
355 * The first CPU is the actual mce_owner.
356 *
357 * A fatal (pcc=1) error might crash the machine
358 * before we're able to log it. To avoid losing the log, we adopt two-
359 * round scanning:
360 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
361 * All MCE banks are sticky; at boot-up, the MCE polling mechanism
362 * will help to collect and log those MCE errors.
363 * Round 2: do all MCE processing logic as normal.
364 */
366 static void mce_panic_check(void)
367 {
368 if (is_mc_panic) {
369 local_irq_enable();
370 for ( ; ; )
371 halt();
372 }
373 }
375 /*
376 * Initialize a barrier. Just set it to 0.
377 */
378 static void mce_barrier_init(struct mce_softirq_barrier *bar)
379 {
380 atomic_set(&bar->val, 0);
381 atomic_set(&bar->ingen, 0);
382 atomic_set(&bar->outgen, 0);
383 }
385 static void mce_handler_init(void)
386 {
387 if (smp_processor_id() != 0)
388 return;
390 /* callback registration; do we really need so many callbacks? */
391 /* mce handler data initialization */
392 mce_barrier_init(&mce_inside_bar);
393 mce_barrier_init(&mce_severity_bar);
394 mce_barrier_init(&mce_trap_bar);
395 spin_lock_init(&mce_logout_lock);
396 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
397 }
398 #if 0
399 /*
400 * This function will need to be used when offlining a CPU in the
401 * recovery actions.
402 *
403 * Decrement a barrier only. Needed for cases where the CPU
404 * in question can't do it itself (e.g. it is being offlined).
405 */
406 static void mce_barrier_dec(struct mce_softirq_barrier *bar)
407 {
408 atomic_inc(&bar->outgen);
409 wmb();
410 atomic_dec(&bar->val);
411 }
412 #endif
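/*
 * Spin with spin_trylock() rather than spin_lock() so that a CPU waiting
 * for mce_logout_lock still polls mce_panic_check() and parks itself if
 * another CPU has already declared a fatal MCE panic.
 */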
414 static void mce_spin_lock(spinlock_t *lk)
415 {
416 while (!spin_trylock(lk)) {
417 cpu_relax();
418 mce_panic_check();
419 }
420 }
422 static void mce_spin_unlock(spinlock_t *lk)
423 {
424 spin_unlock(lk);
425 }
427 /*
428 * Increment the generation number and the value. The generation number
429 * is incremented when entering a barrier. This way, it can be checked
430 * on exit if a CPU is trying to re-enter the barrier. This can happen
431 * if the first CPU to make it out immediately exits or re-enters, while
432 * another CPU that is still in the loop becomes otherwise occupied
433 * (e.g. it needs to service an interrupt, etc), missing the value
434 * it's waiting for.
435 *
436 * These barrier functions should always be paired, so that the
437 * counter value will reach 0 again after all CPUs have exited.
438 */
439 static void mce_barrier_enter(struct mce_softirq_barrier *bar)
440 {
441 int gen;
443 if (!mce_broadcast)
444 return;
445 atomic_inc(&bar->ingen);
446 gen = atomic_read(&bar->outgen);
447 mb();
448 atomic_inc(&bar->val);
449 while ( atomic_read(&bar->val) != num_online_cpus() &&
450 atomic_read(&bar->outgen) == gen) {
451 mb();
452 mce_panic_check();
453 }
454 }
456 static void mce_barrier_exit(struct mce_softirq_barrier *bar)
457 {
458 int gen;
460 if (!mce_broadcast)
461 return;
462 atomic_inc(&bar->outgen);
463 gen = atomic_read(&bar->ingen);
464 mb();
465 atomic_dec(&bar->val);
466 while ( atomic_read(&bar->val) != 0 &&
467 atomic_read(&bar->ingen) == gen ) {
468 mb();
469 mce_panic_check();
470 }
471 }
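/*
 * A minimal usage sketch (this is how intel_machine_check() below brackets
 * its fatal-error check with the trap barrier):
 *
 *     mce_barrier_enter(&mce_trap_bar);
 *     ... work that must not start before every CPU has arrived ...
 *     mce_barrier_exit(&mce_trap_bar);
 *
 * Both calls degrade to no-ops when mce_broadcast is not set.
 */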
473 #if 0
474 static void mce_barrier(struct mce_softirq_barrier *bar)
475 {
476 mce_barrier_enter(bar);
477 mce_barrier_exit(bar);
478 }
479 #endif
481 /* Intel MCE handler */
482 static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
483 {
484 if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
485 && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
486 ext->mc_msr[ext->mc_msrs].reg = msr;
487 rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
488 ++ext->mc_msrs;
489 }
490 }
493 struct mcinfo_extended *
494 intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
495 {
496 struct mcinfo_extended *mc_ext;
497 int i;
499 /*
500 * According to the spec, a processor supporting 64 bit will always
501 * have MSRs beyond IA32_MCG_MISC
502 */
503 if (!mi || !mig || nr_intel_ext_msrs == 0 ||
504 !(mig->mc_gstatus & MCG_STATUS_EIPV))
505 return NULL;
507 mc_ext = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_extended));
508 if (!mc_ext)
509 {
510 mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
511 return NULL;
512 }
514 /* this function will be called when MCG_CAP bit 9 (MCG_EXT_P) = 1 */
515 memset(mc_ext, 0, sizeof(struct mcinfo_extended));
516 mc_ext->common.type = MC_TYPE_EXTENDED;
517 mc_ext->common.size = sizeof(struct mcinfo_extended);
519 for (i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++)
520 intel_get_extended_msr(mc_ext, i);
522 #ifdef __x86_64__
523 for (i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++)
524 intel_get_extended_msr(mc_ext, i);
525 #endif
527 return mc_ext;
528 }
530 enum intel_mce_type
531 {
532 intel_mce_invalid,
533 intel_mce_fatal,
534 intel_mce_corrected,
535 intel_mce_ucr_ucna,
536 intel_mce_ucr_srao,
537 intel_mce_ucr_srar,
538 };
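/*
 * Classify a bank status word, following the UCR taxonomy used below:
 * VAL=0 -> invalid, PCC=1 -> fatal, UC=0 -> corrected; with SER support the
 * S/AR bits then distinguish UCNA (S=0), SRAO (S=1, AR=0) and SRAR
 * (S=1, AR=1), where an SRAR with OVER=1 is treated as fatal. Without SER
 * support every uncorrected error is fatal.
 */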
540 static enum intel_mce_type intel_check_mce_type(uint64_t status)
541 {
542 if (!(status & MCi_STATUS_VAL))
543 return intel_mce_invalid;
545 if (status & MCi_STATUS_PCC)
546 return intel_mce_fatal;
548 /* Corrected error? */
549 if (!(status & MCi_STATUS_UC))
550 return intel_mce_corrected;
552 if (!ser_support)
553 return intel_mce_fatal;
555 if (status & MCi_STATUS_S)
556 {
557 if (status & MCi_STATUS_AR)
558 {
559 if (status & MCi_STATUS_OVER)
560 return intel_mce_fatal;
561 else
562 return intel_mce_ucr_srar;
563 } else
564 return intel_mce_ucr_srao;
565 }
566 else
567 return intel_mce_ucr_ucna;
569 /* Any type not included above? */
570 return intel_mce_fatal;
571 }
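/*
 * The MCACOD patterns matched below appear to be the SDM's SRAO example
 * codes: 0xC0-0xCF for memory-scrubbing errors and 0x17A for a last-level
 * cache explicit writeback. These are the asynchronous memory errors the
 * dedicated memory handler can try to recover by offlining the page.
 */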
573 static int is_async_memerr(uint64_t status)
574 {
575 return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
576 }
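/* Append a page-offline recovery record to the telemetry buffer so Dom0
 * can see which MFN was retired and with what status. */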
578 struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
579 uint64_t mfn, uint32_t status)
580 {
581 struct mcinfo_recovery *rec;
583 if (!mi)
584 return NULL;
586 rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
587 if (!rec)
588 {
589 mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
590 return NULL;
591 }
593 memset(rec, 0, sizeof(struct mcinfo_recovery));
595 rec->mc_bank = bank;
596 rec->action_types = MC_ACTION_PAGE_OFFLINE;
597 rec->action_info.page_retire.mfn = mfn;
598 rec->action_info.page_retire.status = status;
599 return rec;
600 }
602 static void intel_memerr_dhandler(int bnum,
603 struct mca_binfo *binfo,
604 struct mca_handle_result *result)
605 {
606 struct mcinfo_bank *bank = binfo->mib;
607 struct mcinfo_global *global = binfo->mig;
608 struct domain *d;
609 unsigned long mfn, gfn;
610 uint32_t status;
611 uint64_t mc_status, mc_misc;
613 mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
614 result->result = MCA_NEED_RESET;
616 mc_status = bank->mc_status;
617 mc_misc = bank->mc_misc;
618 if (!(mc_status & MCi_STATUS_ADDRV) ||
619 !(mc_status & MCi_STATUS_MISCV) ||
620 ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
621 {
622 result->result |= MCA_NO_ACTION;
623 dprintk(XENLOG_WARNING,
624 "No physical address provided for memory error\n");
625 return;
626 }
628 mfn = bank->mc_addr >> PAGE_SHIFT;
629 if (offline_page(mfn, 1, &status))
630 {
631 dprintk(XENLOG_WARNING,
632 "Failed to offline page %lx for MCE error\n", mfn);
633 return;
634 }
636 mci_add_pageoff_action(bnum, binfo->mi, mfn, status);
638 /* This is a free page */
639 if (status & PG_OFFLINE_OFFLINED)
640 result->result = MCA_RECOVERED;
641 else if (status & PG_OFFLINE_PENDING) {
642 /* This page has an owner */
643 if (status & PG_OFFLINE_OWNED) {
644 result->result |= MCA_OWNER;
645 result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
646 mce_printk(MCE_QUIET, "MCE: This error page is owned"
647 " by DOM %d\n", result->owner);
648 /* Fill vMCE# injection and vMCE# MSR virtualization
649 * related data */
650 bank->mc_domid = result->owner;
651 /* XXX: Cannot handle shared pages yet
652 * (this should identify all domains and gfn mapping to
653 * the mfn in question) */
654 BUG_ON( result->owner == DOMID_COW );
655 if ( result->owner != DOMID_XEN ) {
656 d = get_domain_by_id(result->owner);
657 ASSERT(d);
658 gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
660 if ( !is_vmce_ready(bank, d) )
661 {
662 printk("DOM%d not ready for vMCE\n", d->domain_id);
663 goto vmce_failed;
664 }
666 if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
667 {
668 printk("Unmap broken memory %lx for DOM%d failed\n",
669 mfn, d->domain_id);
670 goto vmce_failed;
671 }
673 bank->mc_addr = gfn << PAGE_SHIFT |
674 (bank->mc_addr & (PAGE_SIZE -1 ));
675 if ( fill_vmsr_data(bank, d,
676 global->mc_gstatus) == -1 )
677 {
678 mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
679 "failed\n", result->owner);
680 goto vmce_failed;
681 }
683 /* We will inject vMCE to the DomU */
684 if ( inject_vmce(d) < 0 )
685 {
686 mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
687 " failed\n", d->domain_id);
688 goto vmce_failed;
689 }
690 /* The impacted domain goes on with its own recovery job
691 * if it has its own MCA handler.
692 * As for Xen, it has contained the error and finished
693 * its own recovery job.
694 */
695 result->result = MCA_RECOVERED;
696 put_domain(d);
698 return;
699 vmce_failed:
700 put_domain(d);
701 domain_crash(d);
702 }
703 }
704 }
705 }
707 static int default_check(uint64_t status)
708 {
709 return 1;
710 }
712 static void intel_default_dhandler(int bnum,
713 struct mca_binfo *binfo,
714 struct mca_handle_result *result)
715 {
716 uint64_t status = binfo->mib->mc_status;
717 enum intel_mce_type type;
719 type = intel_check_mce_type(status);
721 if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
722 result->result = MCA_RESET;
723 else if (type == intel_mce_ucr_srao)
724 result->result = MCA_NO_ACTION;
725 }
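/*
 * mce_action() walks this table in order and uses the first entry whose
 * check function claims the status value, so the memory-error handler
 * takes precedence over the catch-all default handler.
 */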
727 struct mca_error_handler intel_mce_dhandlers[] =
728 {{is_async_memerr, intel_memerr_dhandler}, {default_check, intel_default_dhandler}};
730 static void intel_default_uhandler(int bnum,
731 struct mca_binfo *binfo,
732 struct mca_handle_result *result)
733 {
734 uint64_t status = binfo->mib->mc_status;
735 enum intel_mce_type type;
737 type = intel_check_mce_type(status);
739 switch (type)
740 {
741 /* Panic if no handler for SRAR error */
742 case intel_mce_ucr_srar:
743 case intel_mce_fatal:
744 result->result = MCA_RESET;
745 break;
746 default:
747 result->result = MCA_NO_ACTION;
748 break;
749 }
750 }
752 struct mca_error_handler intel_mce_uhandlers[] =
753 {{default_check, intel_default_uhandler}};
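/*
 * #MC trap handler, run on every CPU that receives the (possibly broadcast)
 * machine check:
 * 1) under mce_logout_lock, scan and log the banks, defer or commit the
 *    telemetry, and note fatal conditions in mce_fatal_cpus;
 * 2) synchronise on mce_trap_bar and let the elected severity_cpu panic if
 *    a fatal error was found anywhere;
 * 3) clear MCG_STATUS.MCIP and hand the remaining work to mce_softirq()
 *    via MACHINE_CHECK_SOFTIRQ.
 */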
755 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
756 {
757 uint64_t gstatus;
758 mctelem_cookie_t mctc = NULL;
759 struct mca_summary bs;
760 struct mca_banks *clear_bank;
762 mce_spin_lock(&mce_logout_lock);
764 clear_bank = __get_cpu_var(mce_clear_banks);
765 memset( clear_bank->bank_map, 0x0,
766 sizeof(long) * BITS_TO_LONGS(clear_bank->num));
767 mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, clear_bank);
769 if (bs.errcnt) {
770 /*
771 * Uncorrected errors must be dealt with in softirq context.
772 */
773 if (bs.uc || bs.pcc) {
774 add_taint(TAINT_MACHINE_CHECK);
775 if (mctc != NULL)
776 mctelem_defer(mctc);
777 /*
778 * If PCC=1 or the error can't be recovered, context is lost, so reboot now without
779 * clearing the banks, and deal with the telemetry after reboot
780 * (the MSRs are sticky)
781 */
782 if (bs.pcc || !bs.recoverable)
783 cpu_set(smp_processor_id(), mce_fatal_cpus);
784 } else {
785 if (mctc != NULL)
786 mctelem_commit(mctc);
787 }
788 atomic_set(&found_error, 1);
790 /* The last CPU to arrive here will take care of the check/clean-up etc. */
791 atomic_set(&severity_cpu, smp_processor_id());
793 mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
794 *((unsigned long*)clear_bank->bank_map), smp_processor_id());
795 mcheck_mca_clearbanks(clear_bank);
796 } else {
797 if (mctc != NULL)
798 mctelem_dismiss(mctc);
799 }
800 mce_spin_unlock(&mce_logout_lock);
802 mce_barrier_enter(&mce_trap_bar);
803 if ( mctc != NULL && mce_urgent_action(regs, mctc))
804 cpu_set(smp_processor_id(), mce_fatal_cpus);
805 mce_barrier_exit(&mce_trap_bar);
806 /*
807 * Wait until everybody has processed the trap.
808 */
809 mce_barrier_enter(&mce_trap_bar);
810 if (atomic_read(&severity_cpu) == smp_processor_id())
811 {
812 /* According to the SDM, if no error bank is found on any CPU,
813 * something unexpected is happening; we can't do any
814 * recovery job but to reset the system.
815 */
816 if (atomic_read(&found_error) == 0)
817 mc_panic("MCE: No CPU found valid MCE, need reset\n");
818 if (!cpus_empty(mce_fatal_cpus))
819 {
820 char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
821 ebufp = ebuf + strlen(ebuf);
822 cpumask_scnprintf(ebufp, 95 - strlen(ebuf), mce_fatal_cpus);
823 mc_panic(ebuf);
824 }
825 atomic_set(&found_error, 0);
826 }
827 mce_barrier_exit(&mce_trap_bar);
829 /* Clear flags after above fatal check */
830 mce_barrier_enter(&mce_trap_bar);
831 gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
832 if ((gstatus & MCG_STATUS_MCIP) != 0) {
833 mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
834 mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
835 }
836 mce_barrier_exit(&mce_trap_bar);
838 raise_softirq(MACHINE_CHECK_SOFTIRQ);
839 }
841 /* According to the MCA OS writer's guide, the CMCI handler needs to clear a bank when
842 * 1) CE (UC = 0)
843 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
844 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
845 * The MCA handler needs to clear a bank when
846 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
847 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
848 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
849 */
851 static int intel_need_clearbank_scan(enum mca_source who, u64 status)
852 {
853 if ( who == MCA_CMCI_HANDLER) {
854 /* CMCI need clear bank */
855 if ( !(status & MCi_STATUS_UC) )
856 return 1;
857 /* Spurious need clear bank */
858 else if ( ser_support && !(status & MCi_STATUS_OVER)
859 && !(status & MCi_STATUS_EN) )
860 return 1;
861 /* UCNA OVER = 0 need clear bank */
862 else if ( ser_support && !(status & MCi_STATUS_OVER)
863 && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
864 && !(status & MCi_STATUS_AR))
865 return 1;
866 /* Only Log, no clear */
867 else return 0;
868 }
869 else if ( who == MCA_MCE_SCAN) {
870 /* Spurious need clear bank */
871 if ( ser_support && !(status & MCi_STATUS_OVER)
872 && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
873 return 1;
874 /* SRAR with OVER=0: clear the bank. OVER = 1 would have caused a reset */
875 else if ( ser_support && (status & MCi_STATUS_UC)
876 && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
877 && !(status & MCi_STATUS_OVER) )
878 return 1;
879 /* SRAO need clear bank */
880 else if ( ser_support && !(status & MCi_STATUS_AR)
881 && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
882 return 1;
883 else
884 return 0;
885 }
887 return 1;
888 }
890 /* The MCE continues/is recoverable when
891 * 1) CE UC = 0
892 * 2) Spurious ser_support = 1, OVER = 0, EN = 0 [UC = 1]
893 * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
894 * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
895 * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
896 */
897 static int intel_recoverable_scan(u64 status)
898 {
900 if ( !(status & MCi_STATUS_UC ) )
901 return 1;
902 else if ( ser_support && !(status & MCi_STATUS_EN)
903 && !(status & MCi_STATUS_OVER) )
904 return 1;
905 /* SRAR error */
906 else if ( ser_support && !(status & MCi_STATUS_OVER)
907 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
908 && (status & MCi_STATUS_AR) ) {
909 mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
910 return 0;
911 }
912 /* SRAO error */
913 else if (ser_support && !(status & MCi_STATUS_PCC)
914 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
915 && (status & MCi_STATUS_EN))
916 return 1;
917 /* UCNA error */
918 else if (ser_support && !(status & MCi_STATUS_OVER)
919 && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
920 && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
921 return 1;
922 return 0;
923 }
925 /* CMCI */
926 static DEFINE_SPINLOCK(cmci_discover_lock);
928 /*
929 * Discover bank sharing using the algorithm recommended in the SDM.
930 */
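/*
 * IA32_MCi_CTL2 usage below: CMCI_EN enables CMCI delivery for the bank and
 * CMCI_THRESHOLD_MASK covers the corrected-error count threshold. Writing
 * CMCI_EN and reading it back tells us whether the bank supports CMCI at
 * all and whether this CPU managed to claim it.
 */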
931 static int do_cmci_discover(int i)
932 {
933 unsigned msr = MSR_IA32_MC0_CTL2 + i;
934 u64 val;
936 rdmsrl(msr, val);
937 /* Some other CPU already owns this bank. */
938 if (val & CMCI_EN) {
939 mcabanks_clear(i, __get_cpu_var(mce_banks_owned));
940 goto out;
941 }
943 val &= ~CMCI_THRESHOLD_MASK;
944 wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
945 rdmsrl(msr, val);
947 if (!(val & CMCI_EN)) {
948 /* This bank does not support CMCI. Polling timer has to handle it. */
949 mcabanks_set(i, __get_cpu_var(no_cmci_banks));
950 return 0;
951 }
952 mcabanks_set(i, __get_cpu_var(mce_banks_owned));
953 out:
954 mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
955 return 1;
956 }
958 static void cmci_discover(void)
959 {
960 unsigned long flags;
961 int i;
962 mctelem_cookie_t mctc;
963 struct mca_summary bs;
965 mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
967 spin_lock_irqsave(&cmci_discover_lock, flags);
969 for (i = 0; i < nr_mce_banks; i++)
970 if (!mcabanks_test(i, __get_cpu_var(mce_banks_owned)))
971 do_cmci_discover(i);
973 spin_unlock_irqrestore(&cmci_discover_lock, flags);
975 /* In case a CMCI happened while the owner was changing:
976 * if a CMCI happened but was not processed immediately,
977 * MCi_STATUS (error count, bits 38~52) is not cleared and
978 * the CMCI interrupt will never be triggered again.
979 */
981 mctc = mcheck_mca_logout(
982 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
984 if (bs.errcnt && mctc != NULL) {
985 if (dom0_vmce_enabled()) {
986 mctelem_commit(mctc);
987 send_guest_global_virq(dom0, VIRQ_MCA);
988 } else {
989 x86_mcinfo_dump(mctelem_dataptr(mctc));
990 mctelem_dismiss(mctc);
991 }
992 } else if (mctc != NULL)
993 mctelem_dismiss(mctc);
995 mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
996 smp_processor_id(),
997 *((unsigned long *)__get_cpu_var(mce_banks_owned)->bank_map),
998 *((unsigned long *)__get_cpu_var(no_cmci_banks)->bank_map));
999 }
1001 /*
1002 * Define an owner for each bank. Banks can be shared between CPUs,
1003 * and to avoid reporting events multiple times we always set up one
1004 * CPU as owner.
1005 *
1006 * The assignment has to be redone when CPUs go offline and
1007 * any of the owners goes away. Also, pollers run in parallel, so we
1008 * have to be careful to update the banks in a way that doesn't
1009 * lose or duplicate events.
1010 */
1012 static void mce_set_owner(void)
1013 {
1014 if (!cmci_support || mce_disabled == 1)
1015 return;
1017 cmci_discover();
1018 }
1020 static void __cpu_mcheck_distribute_cmci(void *unused)
1021 {
1022 cmci_discover();
1023 }
1025 static void cpu_mcheck_distribute_cmci(void)
1026 {
1027 if (cmci_support && !mce_disabled)
1028 on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
1029 }
1031 static void clear_cmci(void)
1032 {
1033 int i;
1035 if (!cmci_support || mce_disabled == 1)
1036 return;
1038 mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
1039 smp_processor_id());
1041 for (i = 0; i < nr_mce_banks; i++) {
1042 unsigned msr = MSR_IA32_MC0_CTL2 + i;
1043 u64 val;
1044 if (!mcabanks_test(i, __get_cpu_var(mce_banks_owned)))
1045 continue;
1046 rdmsrl(msr, val);
1047 if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
1048 wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
1049 mcabanks_clear(i, __get_cpu_var(mce_banks_owned));
1050 }
1051 }
1053 static void cpu_mcheck_disable(void)
1054 {
1055 clear_in_cr4(X86_CR4_MCE);
1057 if (cmci_support && !mce_disabled)
1058 clear_cmci();
1059 }
1061 static void intel_init_cmci(struct cpuinfo_x86 *c)
1062 {
1063 u32 l, apic;
1064 int cpu = smp_processor_id();
1066 if (!mce_available(c) || !cmci_support) {
1067 if (opt_cpu_info)
1068 mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
1069 return;
1070 }
1072 apic = apic_read(APIC_CMCI);
1073 if ( apic & APIC_VECTOR_MASK )
1074 {
1075 mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
1076 cpu, ( apic & APIC_VECTOR_MASK ));
1077 return;
1078 }
1080 apic = CMCI_APIC_VECTOR;
1081 apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
1082 apic_write_around(APIC_CMCI, apic);
1084 l = apic_read(APIC_CMCI);
1085 apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
1087 mce_set_owner();
1088 }
1090 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
1091 {
1092 mctelem_cookie_t mctc;
1093 struct mca_summary bs;
1094 struct cpu_user_regs *old_regs = set_irq_regs(regs);
1096 ack_APIC_irq();
1097 irq_enter();
1099 mctc = mcheck_mca_logout(
1100 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
1102 if (bs.errcnt && mctc != NULL) {
1103 if (dom0_vmce_enabled()) {
1104 mctelem_commit(mctc);
1105 mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
1106 send_guest_global_virq(dom0, VIRQ_MCA);
1107 } else {
1108 x86_mcinfo_dump(mctelem_dataptr(mctc));
1109 mctelem_dismiss(mctc);
1110 }
1111 } else if (mctc != NULL)
1112 mctelem_dismiss(mctc);
1114 irq_exit();
1115 set_irq_regs(old_regs);
1116 }
1118 /* MCA */
1120 static int mce_is_broadcast(struct cpuinfo_x86 *c)
1121 {
1122 if (mce_force_broadcast)
1123 return 1;
1125 /* According to the Intel SDM (Dec. 2009, 15.10.4.1), for processors with
1126 * a DisplayFamily_DisplayModel encoding of 06H_EH and above,
1127 * an MCA signal is broadcast to all logical processors in the system.
1128 */
1129 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
1130 c->x86_model >= 0xe)
1131 return 1;
1132 return 0;
1133 }
1135 /* Check and init MCA */
1136 static void intel_init_mca(struct cpuinfo_x86 *c)
1137 {
1138 bool_t broadcast, cmci = 0, ser = 0;
1139 int ext_num = 0, first;
1140 uint64_t msr_content;
1142 broadcast = mce_is_broadcast(c);
1144 rdmsrl(MSR_IA32_MCG_CAP, msr_content);
1146 if ((msr_content & MCG_CMCI_P) && cpu_has_apic)
1147 cmci = 1;
1149 /* Support Software Error Recovery */
1150 if (msr_content & MCG_SER_P)
1151 ser = 1;
1153 if (msr_content & MCG_EXT_P)
1154 ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;
1156 first = mce_firstbank(c);
1158 if (smp_processor_id() == 0)
1159 {
1160 dprintk(XENLOG_INFO, "MCA Capability: BCAST %x SER %x"
1161 " CMCI %x firstbank %x extended MCE MSR %x\n",
1162 broadcast, ser, cmci, first, ext_num);
1164 mce_broadcast = broadcast;
1165 cmci_support = cmci;
1166 ser_support = ser;
1167 nr_intel_ext_msrs = ext_num;
1168 firstbank = first;
1169 }
1170 else if (cmci != cmci_support || ser != ser_support ||
1171 broadcast != mce_broadcast ||
1172 first != firstbank || ext_num != nr_intel_ext_msrs)
1173 {
1174 dprintk(XENLOG_WARNING,
1175 "CPU %u has different MCA capability (%x,%x,%x,%x,%x)"
1176 " than BSP, may cause undetermined result!!!\n",
1177 smp_processor_id(), broadcast, ser, cmci, first, ext_num);
1178 }
1179 }
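/*
 * Drain whatever is still sitting in the sticky bank registers from before
 * this boot or reset: if anything is found it is dumped to the console and
 * committed as telemetry for Dom0.
 */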
1181 static void intel_mce_post_reset(void)
1182 {
1183 mctelem_cookie_t mctc;
1184 struct mca_summary bs;
1186 mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
1188 /* In the boot-up stage, print the errors and also log them for the DOM0 boot process */
1189 if (bs.errcnt && mctc != NULL) {
1190 x86_mcinfo_dump(mctelem_dataptr(mctc));
1191 mctelem_commit(mctc);
1192 }
1193 return;
1194 }
1196 static void intel_init_mce(void)
1197 {
1198 uint64_t msr_content;
1199 int i;
1201 intel_mce_post_reset();
1203 /* clear all banks */
1204 for (i = firstbank; i < nr_mce_banks; i++)
1205 {
1206 /* Some banks are shared across cores; use MCi_CTL to judge whether
1207 * this bank has already been initialized by another core. */
1208 rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
1209 if (!msr_content)
1210 {
1211 /* if ctl is 0, this bank has never been initialized */
1212 mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
1213 wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
1214 wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
1215 }
1216 }
1217 if (firstbank) /* if cmci enabled, firstbank = 0 */
1218 wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);
1220 x86_mce_vector_register(intel_machine_check);
1221 mce_recoverable_register(intel_recoverable_scan);
1222 mce_need_clearbank_register(intel_need_clearbank_scan);
1224 mce_dhandlers = intel_mce_dhandlers;
1225 mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct mca_error_handler);
1226 mce_uhandlers = intel_mce_uhandlers;
1227 mce_uhandler_num = sizeof(intel_mce_uhandlers)/sizeof(struct mca_error_handler);
1228 }
1230 static int intel_init_mca_banks(void)
1231 {
1232 struct mca_banks *mb1, *mb2, *mb3;
1234 mb1 = mcabanks_alloc();
1235 mb2 = mcabanks_alloc();
1236 mb3 = mcabanks_alloc();
1237 if (!mb1 || !mb2 || !mb3)
1238 goto out;
1240 __get_cpu_var(mce_clear_banks) = mb1;
1241 __get_cpu_var(no_cmci_banks) = mb2;
1242 __get_cpu_var(mce_banks_owned) = mb3;
1244 return 0;
1245 out:
1246 mcabanks_free(mb1);
1247 mcabanks_free(mb2);
1248 mcabanks_free(mb3);
1249 return -ENOMEM;
1250 }
1252 /* The P4/P6 families have a similar MCA initialization process */
1253 enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c)
1254 {
1255 if (intel_init_mca_banks())
1256 return mcheck_none;
1258 intel_init_mca(c);
1260 mce_handler_init();
1262 intel_init_mce();
1264 intel_init_cmci(c);
1265 #ifdef CONFIG_X86_MCE_THERMAL
1266 intel_init_thermal(c);
1267 #endif
1269 return mcheck_intel;
1270 }
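/*
 * Intercepts for guest accesses to the Intel-specific MCA MSRs. Xen keeps
 * the CMCI capability to itself, so guest reads and writes of the per-bank
 * MCi_CTL2 range are only warned about and flagged (ret = 1); they never
 * reach the real hardware registers.
 */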
1272 /* Intel-specific MCA MSRs */
1273 int intel_mce_wrmsr(uint32_t msr, uint64_t val)
1274 {
1275 int ret = 0;
1277 if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
1278 {
1279 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1280 "Guest should not write this MSR!\n");
1281 ret = 1;
1282 }
1284 return ret;
1285 }
1287 int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
1288 {
1289 int ret = 0;
1291 if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
1292 {
1293 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1294 "Guest should not read this MSR!\n");
1295 ret = 1;
1296 }
1298 return ret;
1299 }
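/*
 * CPU hotplug notifier: a dying CPU has MCE disabled in CR4 and gives up
 * its CMCI banks; once it is dead, the surviving CPUs re-run CMCI discovery
 * so the orphaned banks get a new owner.
 */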
1301 static int cpu_callback(
1302 struct notifier_block *nfb, unsigned long action, void *hcpu)
1303 {
1304 switch ( action )
1305 {
1306 case CPU_DYING:
1307 cpu_mcheck_disable();
1308 break;
1309 case CPU_DEAD:
1310 cpu_mcheck_distribute_cmci();
1311 break;
1312 default:
1313 break;
1314 }
1316 return NOTIFY_DONE;
1317 }
1319 static struct notifier_block cpu_nfb = {
1320 .notifier_call = cpu_callback
1321 };
1323 static int __init intel_mce_initcall(void)
1324 {
1325 register_cpu_notifier(&cpu_nfb);
1326 return 0;
1327 }
1328 presmp_initcall(intel_mce_initcall);