debuggers.hg

view xen/arch/x86/cpu/mcheck/mce_intel.c @ 20959:5b895c3f4386

Dump machine check context for fatal machine check

This small patch enables the Xen hypervisor to always dump the machine check
context; previously it would print nothing when a fatal MCE happened. It
also adds a check for a NULL pointer.

It also changes the address passed to the guest to always use the
guest's frame number. This should benefit non-translated guests.
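
For reference, the guest-visible address is derived in the UCR handler below
roughly as follows (a minimal sketch using the get_gpfn_from_mfn() helper and
PAGE_* macros that the handler itself relies on):

    /* Translate the machine address recorded in the bank into the
     * guest's own frame number, keeping the page offset. */
    unsigned long mfn = bank->mc_addr >> PAGE_SHIFT;
    unsigned long gfn = get_gpfn_from_mfn(mfn);
    bank->mc_addr = (gfn << PAGE_SHIFT) | (bank->mc_addr & (PAGE_SIZE - 1));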

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Feb 08 10:18:51 2010 +0000 (2010-02-08)
parents ebd2495ec073
children da7ae6d8838a
line source
1 #include <xen/init.h>
2 #include <xen/types.h>
3 #include <xen/irq.h>
4 #include <xen/event.h>
5 #include <xen/kernel.h>
6 #include <xen/delay.h>
7 #include <xen/smp.h>
8 #include <xen/mm.h>
9 #include <asm/processor.h>
10 #include <public/sysctl.h>
11 #include <asm/system.h>
12 #include <asm/msr.h>
13 #include <asm/p2m.h>
14 #include "mce.h"
15 #include "x86_mca.h"
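/*
 * Per-CPU bitmaps: mce_banks_owned marks the MCE banks this CPU owns
 * for CMCI purposes; no_cmci_banks marks banks without CMCI support
 * that are left to the polling timer. Both are maintained by
 * do_cmci_discover() and clear_cmci() below.
 */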
17 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
18 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
19 int cmci_support = 0;
20 int ser_support = 0;
22 static int nr_intel_ext_msrs = 0;
24 /* Below are for MCE handling */
25 struct mce_softirq_barrier {
26 atomic_t val;
27 atomic_t ingen;
28 atomic_t outgen;
29 };
31 static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
32 static struct mce_softirq_barrier mce_trap_bar;
34 /*
35 * mce_logout_lock should only be used in the trap handler,
36 * while MCIP has not been cleared yet in the global status
37 * register. Other use is not safe, since an MCE trap can
38 * happen at any moment, which would cause lock recursion.
39 */
40 static DEFINE_SPINLOCK(mce_logout_lock);
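/*
 * severity_cpu is the CPU elected in mce_softirq() to process the
 * deferred telemetry for all CPUs; found_error is set by the trap
 * handler when any bank reported an error and is checked across the
 * mce_trap_bar rendezvous.
 */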
42 static atomic_t severity_cpu = ATOMIC_INIT(-1);
43 static atomic_t found_error = ATOMIC_INIT(0);
45 static void mce_barrier_enter(struct mce_softirq_barrier *);
46 static void mce_barrier_exit(struct mce_softirq_barrier *);
48 #ifdef CONFIG_X86_MCE_THERMAL
49 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
50 {
51 printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
52 smp_processor_id());
53 add_taint(TAINT_MACHINE_CHECK);
54 }
56 /* P4/Xeon Thermal transition interrupt handler */
57 static void intel_thermal_interrupt(struct cpu_user_regs *regs)
58 {
59 u32 l, h;
60 unsigned int cpu = smp_processor_id();
61 static s_time_t next[NR_CPUS];
63 ack_APIC_irq();
64 if (NOW() < next[cpu])
65 return;
67 next[cpu] = NOW() + MILLISECS(5000);
68 rdmsr(MSR_IA32_THERM_STATUS, l, h);
69 if (l & 0x1) {
70 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
71 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
72 cpu);
73 add_taint(TAINT_MACHINE_CHECK);
74 } else {
75 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
76 }
77 }
79 /* Thermal interrupt handler for this CPU setup */
80 static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs)
81 = unexpected_thermal_interrupt;
83 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
84 {
85 struct cpu_user_regs *old_regs = set_irq_regs(regs);
86 irq_enter();
87 vendor_thermal_interrupt(regs);
88 irq_exit();
89 set_irq_regs(old_regs);
90 }
92 /* P4/Xeon Thermal regulation detect and init */
93 static void intel_init_thermal(struct cpuinfo_x86 *c)
94 {
95 u32 l, h;
96 int tm2 = 0;
97 unsigned int cpu = smp_processor_id();
99 /* Thermal monitoring */
100 if (!cpu_has(c, X86_FEATURE_ACPI))
101 return; /* -ENODEV */
103 /* Clock modulation */
104 if (!cpu_has(c, X86_FEATURE_ACC))
105 return; /* -ENODEV */
107 /* first check if it's enabled already, in which case there might
108 * be some SMM goo which handles it, so we can't even put a handler
109 * since it might be delivered via SMI already -zwanem.
110 */
111 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
112 h = apic_read(APIC_LVTTHMR);
113 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
114 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
115 return; /* -EBUSY */
116 }
118 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
119 tm2 = 1;
121 /* check whether a vector already exists, temporarily masked? */
122 if (h & APIC_VECTOR_MASK) {
123 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
124 cpu, (h & APIC_VECTOR_MASK));
125 return; /* -EBUSY */
126 }
128 /* The temperature transition interrupt handler setup */
129 h = THERMAL_APIC_VECTOR; /* our delivery vector */
130 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
131 apic_write_around(APIC_LVTTHMR, h);
133 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
134 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
136 /* ok we're good to go... */
137 vendor_thermal_interrupt = intel_thermal_interrupt;
139 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
140 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
142 l = apic_read (APIC_LVTTHMR);
143 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
144 printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
145 cpu, tm2 ? "TM2" : "TM1");
146 return;
147 }
148 #endif /* CONFIG_X86_MCE_THERMAL */
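/*
 * Capture one extended MCG MSR into the mcinfo_extended record,
 * bounded both by the record's array size and by the number of
 * extended MSRs the CPU reports (nr_intel_ext_msrs).
 */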
150 static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
151 {
152 if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
153 && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
154 ext->mc_msr[ext->mc_msrs].reg = msr;
155 mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
156 ++ext->mc_msrs;
157 }
158 }
160 static enum mca_extinfo
161 intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
162 {
163 struct mcinfo_extended mc_ext;
165 if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
166 return MCA_EXTINFO_IGNORED;
168 /* this function will be called when CAP(9).MCG_EXT_P = 1 */
169 memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
170 mc_ext.common.type = MC_TYPE_EXTENDED;
171 mc_ext.common.size = sizeof(mc_ext);
173 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EAX);
174 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBX);
175 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ECX);
176 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDX);
177 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESI);
178 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDI);
179 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBP);
180 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESP);
181 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EFLAGS);
182 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EIP);
183 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_MISC);
185 #ifdef __x86_64__
186 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R8);
187 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R9);
188 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R10);
189 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R11);
190 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R12);
191 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R13);
192 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R14);
193 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R15);
194 #endif
196 x86_mcinfo_add(mci, &mc_ext);
198 return MCA_EXTINFO_GLOBAL;
199 }
201 /* This node list records errors impacting a domain. When one
202 * MCE# happens, one error bank impacts a domain. This error node
203 * will be inserted at the tail of the per-domain data for vMCE# MSR
204 * virtualization. When one vMCE# injection has been fully
205 * processed by the guest, the corresponding node will be deleted.
206 * This node list is for GUEST vMCE# MSR virtualization.
207 */
208 static struct bank_entry* alloc_bank_entry(void) {
209 struct bank_entry *entry;
211 entry = xmalloc(struct bank_entry);
212 if (!entry) {
213 printk(KERN_ERR "MCE: malloc bank_entry failed\n");
214 return NULL;
215 }
216 memset(entry, 0x0, sizeof(*entry));
217 INIT_LIST_HEAD(&entry->list);
218 return entry;
219 }
221 /* Fill error bank info for vMCE# injection and GUEST vMCE#
222 * MSR virtualization data:
223 * 1) Record the impacted domain's number of pending injections (nr_injection).
224 * 2) Copy the MCE# error bank to the impacted domain's node list,
225 * for vMCE# MSR virtualization.
226 */
228 static int fill_vmsr_data(struct mcinfo_bank *mc_bank,
229 uint64_t gstatus) {
230 struct domain *d;
231 struct bank_entry *entry;
233 /* This error bank impacts one domain, we need to fill domain related
234 * data for vMCE MSRs virtualization and vMCE# injection */
235 if (mc_bank->mc_domid != (uint16_t)~0) {
236 d = get_domain_by_id(mc_bank->mc_domid);
238 /* Does not impact a valid domain; skip this bank's error */
239 if (!d) {
240 mce_printk(MCE_QUIET, "MCE: Not found valid impacted DOM\n");
241 return 0;
242 }
244 /* For an HVM guest, only when the first vMCE has been consumed by the guest
245 * successfully will we generate another node and inject another vMCE.
246 */
247 if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
248 {
249 mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
250 " vMCE yet!\n");
251 return -1;
252 }
253 entry = alloc_bank_entry();
254 if (entry == NULL)
255 return -1;
257 entry->mci_status = mc_bank->mc_status;
258 entry->mci_addr = mc_bank->mc_addr;
259 entry->mci_misc = mc_bank->mc_misc;
260 entry->bank = mc_bank->mc_bank;
262 spin_lock(&d->arch.vmca_msrs.lock);
263 /* New error Node, insert to the tail of the per_dom data */
264 list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
265 /* Fill MSR global status */
266 d->arch.vmca_msrs.mcg_status = gstatus;
267 /* New node impacts the domain; another vMCE# injection is needed */
268 d->arch.vmca_msrs.nr_injection++;
269 spin_unlock(&d->arch.vmca_msrs.lock);
271 mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
272 "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
273 mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
274 mc_bank->mc_domid);
275 }
276 return 0;
277 }
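/* Deliver a pending vMCE to vcpu0 of the impacted domain. For an HVM
 * guest setting the pending flag and kicking the vcpu is enough; for a
 * PV guest the vcpu is temporarily pinned to this CPU before the kick,
 * and a PV guest without a #MC trap handler is crashed. Fails if a
 * previous vMCE has not been injected yet. */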
279 static int inject_mce(struct domain *d)
280 {
281 int cpu = smp_processor_id();
282 cpumask_t affinity;
284 /* PV guests and HVM guests have different vMCE# injection
285 * methods. */
287 if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
288 {
289 if (d->is_hvm)
290 {
291 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
292 d->domain_id);
293 vcpu_kick(d->vcpu[0]);
294 }
295 /* PV guest including DOM0 */
296 else
297 {
298 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
299 d->domain_id);
300 if (guest_has_trap_callback
301 (d, 0, TRAP_machine_check))
302 {
303 d->vcpu[0]->cpu_affinity_tmp =
304 d->vcpu[0]->cpu_affinity;
305 cpus_clear(affinity);
306 cpu_set(cpu, affinity);
307 mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", cpu,
308 d->vcpu[0]->processor);
309 vcpu_set_affinity(d->vcpu[0], &affinity);
310 vcpu_kick(d->vcpu[0]);
311 }
312 else
313 {
314 mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE handler\n");
315 domain_crash(d);
316 }
317 }
318 }
319 else {
320 /* A new vMCE arrives while the first one has not been injected yet;
321 * in this case the injection fails. [We can't lose this vMCE, for
322 * the sake of the mce node's consistency.]
323 */
324 mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
325 "to this DOM%d!\n", d->domain_id);
326 return -1;
327 }
328 return 0;
329 }
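/* Recovery action for an uncorrected recoverable (UCR) error: try to
 * offline the affected page; if the page belongs to a domain, record
 * the owner, translate the address to the guest's frame, fill the vMCE
 * MSR data and inject a vMCE. If nothing can be done, flag that a
 * reset is needed. */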
331 static void intel_UCR_handler(struct mcinfo_bank *bank,
332 struct mcinfo_global *global,
333 struct mcinfo_extended *extension,
334 struct mca_handle_result *result)
335 {
336 struct domain *d;
337 unsigned long mfn, gfn;
338 uint32_t status;
340 mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
341 result->result = MCA_NEED_RESET;
342 if (bank->mc_addr != 0) {
343 mfn = bank->mc_addr >> PAGE_SHIFT;
344 if (!offline_page(mfn, 1, &status)) {
345 /* This is free page */
346 if (status & PG_OFFLINE_OFFLINED)
347 result->result = MCA_RECOVERED;
348 else if (status & PG_OFFLINE_PENDING) {
349 /* This page has owner */
350 if (status & PG_OFFLINE_OWNED) {
351 result->result |= MCA_OWNER;
352 result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
353 mce_printk(MCE_QUIET, "MCE: This error page is owned"
354 " by DOM %d\n", result->owner);
355 /* Fill vMCE# injection and vMCE# MSR virtualization
356 * related data */
357 bank->mc_domid = result->owner;
358 /* XXX: Cannot handle shared pages yet
359 * (this should identify all domains and gfn mapping to
360 * the mfn in question) */
361 BUG_ON( result->owner == DOMID_COW );
362 if ( result->owner != DOMID_XEN ) {
364 d = get_domain_by_id(result->owner);
365 if ( mca_ctl_conflict(bank, d) )
366 {
367 /* Guest has different MCE ctl with hypervisor */
368 put_domain(d);
369 return;
370 }
372 gfn =
373 get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
374 bank->mc_addr = gfn << PAGE_SHIFT |
375 (bank->mc_addr & (PAGE_SIZE -1 ));
376 if (fill_vmsr_data(bank, global->mc_gstatus) == -1)
377 {
378 mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
379 "failed\n", result->owner);
380 put_domain(d);
381 domain_crash(d);
382 return;
383 }
384 /* We will inject vMCE to DOMU*/
385 if ( inject_mce(d) < 0 )
386 {
387 mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
388 " failed\n", d->domain_id);
389 put_domain(d);
390 domain_crash(d);
391 return;
392 }
393 /* The impacted domain goes on with its own recovery job
394 * if it has its own MCA handler.
395 * For Xen, the error has been contained and Xen has finished
396 * its own recovery job.
397 */
398 result->result = MCA_RECOVERED;
399 put_domain(d);
400 }
401 }
402 }
403 }
404 }
405 }
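/* MCA error codes claimed by the UCR handler above; mce_action() below
 * matches them against the low bits of MCi_STATUS. */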
407 #define INTEL_MAX_RECOVERY 2
408 struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
409 {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
411 /*
412 * Called from mctelem_process_deferred. Return 1 if the telemetry
413 * should be committed for dom0 consumption, 0 if it should be
414 * dismissed.
415 */
416 static int mce_action(mctelem_cookie_t mctc)
417 {
418 struct mc_info *local_mi;
419 uint32_t i;
420 struct mcinfo_common *mic = NULL;
421 struct mcinfo_global *mc_global;
422 struct mcinfo_bank *mc_bank;
423 struct mca_handle_result mca_res;
425 local_mi = (struct mc_info*)mctelem_dataptr(mctc);
426 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
427 if (mic == NULL) {
428 printk(KERN_ERR "MCE: get local buffer entry failed\n ");
429 return 0;
430 }
432 mc_global = (struct mcinfo_global *)mic;
434 /* Processing bank information */
435 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
437 for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
438 if (mic->type != MC_TYPE_BANK) {
439 continue;
440 }
441 mc_bank = (struct mcinfo_bank*)mic;
443 /* TODO: Add recovery actions here, such as page-offline, etc */
444 memset(&mca_res, 0x0f, sizeof(mca_res));
445 for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
446 if ( ((mc_bank->mc_status & 0xffff) ==
447 intel_recovery_handler[i].mca_code) ||
448 ((mc_bank->mc_status & 0xfff0) ==
449 intel_recovery_handler[i].mca_code)) {
450 /* For SRAR, OVER = 1 should have caused a reset.
451 * For SRAO, OVER = 1 means skip the recovery action and continue execution.
452 */
453 if (!(mc_bank->mc_status & MCi_STATUS_OVER))
454 intel_recovery_handler[i].recovery_handler
455 (mc_bank, mc_global, NULL, &mca_res);
456 else {
457 if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
458 mca_res.result = MCA_NEED_RESET;
459 else
460 mca_res.result = MCA_NO_ACTION;
461 }
462 if (mca_res.result & MCA_OWNER)
463 mc_bank->mc_domid = mca_res.owner;
464 if (mca_res.result == MCA_NEED_RESET)
465 /* DOMID_XEN*/
466 mc_panic("MCE: Software recovery failed for the UCR "
467 "error\n");
468 else if (mca_res.result == MCA_RECOVERED)
469 mce_printk(MCE_VERBOSE, "MCE: The UCR error is "
470 "successfully recovered by software!\n");
471 else if (mca_res.result == MCA_NO_ACTION)
472 mce_printk(MCE_VERBOSE, "MCE: Overwritten SRAO error can't "
473 "do a recovery action, RIPV=1, let it be.\n");
474 break;
475 }
476 }
477 /* For an SRAR error with no defined recovery action, the MCA handler
478 * should already have caused a reset, so what reaches here is SRAO.
479 */
480 if ( i >= INTEL_MAX_RECOVERY )
481 mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
482 " found for this SRAO error\n");
484 }
485 return 1;
486 }
488 /* Softirq Handler for this MCE# processing */
489 static void mce_softirq(void)
490 {
491 int cpu = smp_processor_id();
492 unsigned int workcpu;
494 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
496 mce_barrier_enter(&mce_inside_bar);
498 /*
499 * Everybody is here. Now let's see who gets to do the
500 * recovery work. Right now we just see if there's a CPU
501 * that did not have any problems, and pick that one.
502 *
503 * First, just set a default value: the last CPU who reaches this
504 * will overwrite the value and become the default.
505 */
507 atomic_set(&severity_cpu, cpu);
509 mce_barrier_enter(&mce_severity_bar);
510 if (!mctelem_has_deferred(cpu))
511 atomic_set(&severity_cpu, cpu);
512 mce_barrier_exit(&mce_severity_bar);
514 /* We choose severity_cpu for further processing */
515 if (atomic_read(&severity_cpu) == cpu) {
517 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
519 /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
520 * vMCE MSRs virtualization buffer
521 */
522 for_each_online_cpu(workcpu) {
523 mctelem_process_deferred(workcpu, mce_action);
524 }
526 /* Step2: Send Log to DOM0 through vIRQ */
527 if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
528 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
529 send_guest_global_virq(dom0, VIRQ_MCA);
530 }
531 }
533 mce_barrier_exit(&mce_inside_bar);
534 }
536 /* Machine check owner judgement algorithm:
537 * When an error happens, all CPUs serially read their MSR banks.
538 * The first CPU that fetches the error bank's info will clear
539 * this bank; later readers can't get any info from it again.
540 * That first CPU is the actual mce owner.
541 *
542 * A fatal (pcc=1) error might crash the machine
543 * before we're able to log. To avoid losing the log, we adopt
544 * two-round scanning:
545 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
546 * All MCE banks are sticky; on the next boot, the MCE polling mechanism
547 * will collect and log those MCE errors.
548 * Round 2: do all the MCE processing logic as normal.
549 */
551 static void mce_panic_check(void)
552 {
553 if (is_mc_panic) {
554 local_irq_enable();
555 for ( ; ; )
556 halt();
557 }
558 }
560 /*
561 * Initialize a barrier. Just set it to 0.
562 */
563 static void mce_barrier_init(struct mce_softirq_barrier *bar)
564 {
565 atomic_set(&bar->val, 0);
566 atomic_set(&bar->ingen, 0);
567 atomic_set(&bar->outgen, 0);
568 }
570 #if 0
571 /*
572 * This function will need to be used when offlining a CPU in the
573 * recovery actions.
574 *
575 * Decrement a barrier only. Needed for cases where the CPU
576 * in question can't do it itself (e.g. it is being offlined).
577 */
578 static void mce_barrier_dec(struct mce_softirq_barrier *bar)
579 {
580 atomic_inc(&bar->outgen);
581 wmb();
582 atomic_dec(&bar->val);
583 }
584 #endif
586 static void mce_spin_lock(spinlock_t *lk)
587 {
588 while (!spin_trylock(lk)) {
589 cpu_relax();
590 mce_panic_check();
591 }
592 }
594 static void mce_spin_unlock(spinlock_t *lk)
595 {
596 spin_unlock(lk);
597 }
599 /*
600 * Increment the generation number and the value. The generation number
601 * is incremented when entering a barrier. This way, it can be checked
602 * on exit if a CPU is trying to re-enter the barrier. This can happen
603 * if the first CPU to make it out immediately exits or re-enters, while
604 * another CPU that is still in the loop becomes otherwise occupied
605 * (e.g. it needs to service an interrupt, etc), missing the value
606 * it's waiting for.
607 *
608 * These barrier functions should always be paired, so that the
609 * counter value will reach 0 again after all CPUs have exited.
610 */
611 static void mce_barrier_enter(struct mce_softirq_barrier *bar)
612 {
613 int gen;
615 if (!mce_broadcast)
616 return;
617 atomic_inc(&bar->ingen);
618 gen = atomic_read(&bar->outgen);
619 mb();
620 atomic_inc(&bar->val);
621 while ( atomic_read(&bar->val) != num_online_cpus() &&
622 atomic_read(&bar->outgen) == gen) {
623 mb();
624 mce_panic_check();
625 }
626 }
628 static void mce_barrier_exit(struct mce_softirq_barrier *bar)
629 {
630 int gen;
632 if (!mce_broadcast)
633 return;
634 atomic_inc(&bar->outgen);
635 gen = atomic_read(&bar->ingen);
636 mb();
637 atomic_dec(&bar->val);
638 while ( atomic_read(&bar->val) != 0 &&
639 atomic_read(&bar->ingen) == gen ) {
640 mb();
641 mce_panic_check();
642 }
643 }
645 #if 0
646 static void mce_barrier(struct mce_softirq_barrier *bar)
647 {
648 mce_barrier_enter(bar);
649 mce_barrier_exit(bar);
650 }
651 #endif
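/*
 * #MC trap handler. Under mce_logout_lock, log out all banks and dump
 * the machine check context; defer uncorrected errors to the softirq
 * handler and panic right away on PCC=1, RIPV=0 or otherwise
 * unrecoverable errors. Then rendezvous with the other CPUs through
 * mce_trap_bar, clear MCIP and raise MACHINE_CHECK_SOFTIRQ.
 */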
653 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
654 {
655 uint64_t gstatus;
656 mctelem_cookie_t mctc = NULL;
657 struct mca_summary bs;
658 cpu_banks_t clear_bank;
660 mce_spin_lock(&mce_logout_lock);
662 memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
663 mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
665 if (bs.errcnt) {
666 /* dump MCE error */
667 if (mctc != NULL)
668 x86_mcinfo_dump(mctelem_dataptr(mctc));
670 /*
671 * Uncorrected errors must be dealt with in softirq context.
672 */
673 if (bs.uc || bs.pcc) {
674 add_taint(TAINT_MACHINE_CHECK);
675 if (mctc != NULL)
676 mctelem_defer(mctc);
677 /*
678 * If PCC=1 the error can't be recovered and context is lost, so reboot now without
679 * clearing the banks, and deal with the telemetry after reboot
680 * (the MSRs are sticky)
681 */
682 if (bs.pcc)
683 mc_panic("State lost due to machine check exception.\n");
684 if (!bs.ripv)
685 mc_panic("RIPV =0 can't resume execution!\n");
686 if (!bs.recoverable)
687 mc_panic("Machine check exception software recovery fail.\n");
688 } else {
689 if (mctc != NULL)
690 mctelem_commit(mctc);
691 }
692 atomic_set(&found_error, 1);
694 mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
695 *((unsigned long*)clear_bank), smp_processor_id());
696 mcheck_mca_clearbanks(clear_bank);
697 } else {
698 if (mctc != NULL)
699 mctelem_dismiss(mctc);
700 }
701 mce_spin_unlock(&mce_logout_lock);
703 /*
704 * Wait until everybody has processed the trap.
705 */
706 mce_barrier_enter(&mce_trap_bar);
707 /* According to the latest MCA OS writer's guide, if no error bank is found
708 * on any CPU, something unexpected is happening; we can't do any
709 * recovery job but reset the system.
710 */
711 if (atomic_read(&found_error) == 0)
712 mc_panic("Unexpected condition for the MCE handler, need reset\n");
713 mce_barrier_exit(&mce_trap_bar);
715 /* Clear the error-found flag after all CPUs finish the above judgement */
716 mce_barrier_enter(&mce_trap_bar);
717 if (atomic_read(&found_error)) {
718 mce_printk(MCE_CRITICAL, "MCE: Choose one CPU "
719 "to clear error finding flag\n ");
720 atomic_set(&found_error, 0);
721 }
722 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
723 if ((gstatus & MCG_STATUS_MCIP) != 0) {
724 mce_printk(MCE_CRITICAL, "MCE: Clear MCIP at the last step\n");
725 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
726 }
727 mce_barrier_exit(&mce_trap_bar);
729 raise_softirq(MACHINE_CHECK_SOFTIRQ);
730 }
732 /* According to the MCA OS writer's guide, the CMCI handler needs to clear a bank when
733 * 1) CE (UC = 0)
734 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
735 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
736 * The MCA handler needs to clear a bank when
737 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
738 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
739 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
740 */
742 static int intel_need_clearbank_scan(enum mca_source who, u64 status)
743 {
744 if ( who == MCA_CMCI_HANDLER) {
745 /* CMCI need clear bank */
746 if ( !(status & MCi_STATUS_UC) )
747 return 1;
748 /* Spurious need clear bank */
749 else if ( ser_support && !(status & MCi_STATUS_OVER)
750 && !(status & MCi_STATUS_EN) )
751 return 1;
752 /* UCNA OVER = 0 need clear bank */
753 else if ( ser_support && !(status & MCi_STATUS_OVER)
754 && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
755 && !(status & MCi_STATUS_AR))
756 return 1;
757 /* Only Log, no clear */
758 else return 0;
759 }
760 else if ( who == MCA_MCE_SCAN) {
761 /* Spurious need clear bank */
762 if ( ser_support && !(status & MCi_STATUS_OVER)
763 && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
764 return 1;
765 /* SRAR OVER=0: clear bank. OVER = 1 should have caused a reset */
766 else if ( ser_support && (status & MCi_STATUS_UC)
767 && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
768 && (status & MCi_STATUS_OVER) )
769 return 1;
770 /* SRAO need clear bank */
771 else if ( ser_support && !(status & MCi_STATUS_AR)
772 && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
773 return 1;
774 else
775 return 0;
776 }
778 return 1;
779 }
781 /* MCE continues/is recoverable when
782 * 1) CE UC = 0
783 * 2) Spurious ser_support = 1, OVER = 0, EN = 0 [UC = 1]
784 * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
785 * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
786 * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
787 */
788 static int intel_recoverable_scan(u64 status)
789 {
791 if ( !(status & MCi_STATUS_UC ) )
792 return 1;
793 else if ( ser_support && !(status & MCi_STATUS_EN)
794 && !(status & MCi_STATUS_OVER) )
795 return 1;
796 /* SRAR error */
797 else if ( ser_support && !(status & MCi_STATUS_OVER)
798 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
799 && (status & MCi_STATUS_AR) ) {
800 mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
801 return 0;
802 }
803 /* SRAO error */
804 else if (ser_support && !(status & MCi_STATUS_PCC)
805 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
806 && (status & MCi_STATUS_EN))
807 return 1;
808 /* UCNA error */
809 else if (ser_support && !(status & MCi_STATUS_OVER)
810 && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
811 && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
812 return 1;
813 return 0;
814 }
816 static DEFINE_SPINLOCK(cmci_discover_lock);
818 /*
819 * Discover bank sharing using the algorithm recommended in the SDM.
820 */
821 static int do_cmci_discover(int i)
822 {
823 unsigned msr = MSR_IA32_MC0_CTL2 + i;
824 u64 val;
826 rdmsrl(msr, val);
827 /* Some other CPU already owns this bank. */
828 if (val & CMCI_EN) {
829 clear_bit(i, __get_cpu_var(mce_banks_owned));
830 goto out;
831 }
832 wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
833 rdmsrl(msr, val);
835 if (!(val & CMCI_EN)) {
836 /* This bank does not support CMCI. Polling timer has to handle it. */
837 set_bit(i, __get_cpu_var(no_cmci_banks));
838 return 0;
839 }
840 set_bit(i, __get_cpu_var(mce_banks_owned));
841 out:
842 clear_bit(i, __get_cpu_var(no_cmci_banks));
843 return 1;
844 }
846 static void cmci_discover(void)
847 {
848 unsigned long flags;
849 int i;
850 mctelem_cookie_t mctc;
851 struct mca_summary bs;
853 mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
855 spin_lock_irqsave(&cmci_discover_lock, flags);
857 for (i = 0; i < nr_mce_banks; i++)
858 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
859 do_cmci_discover(i);
861 spin_unlock_irqrestore(&cmci_discover_lock, flags);
863 /* In case a CMCI happened during the owner change:
864 * if a CMCI happened but was not processed immediately,
865 * MCi_STATUS (error_count bits 38~52) is not cleared and
866 * the CMCI interrupt will never be triggered again.
867 */
869 mctc = mcheck_mca_logout(
870 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
872 if (bs.errcnt && mctc != NULL) {
873 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
874 mctelem_commit(mctc);
875 send_guest_global_virq(dom0, VIRQ_MCA);
876 } else {
877 x86_mcinfo_dump(mctelem_dataptr(mctc));
878 mctelem_dismiss(mctc);
879 }
880 } else if (mctc != NULL)
881 mctelem_dismiss(mctc);
883 mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
884 smp_processor_id(),
885 *((unsigned long *)__get_cpu_var(mce_banks_owned)),
886 *((unsigned long *)__get_cpu_var(no_cmci_banks)));
887 }
889 /*
890 * Define an owner for each bank. Banks can be shared between CPUs
891 * and to avoid reporting events multiple times always set up one
892 * CPU as owner.
893 *
894 * The assignment has to be redone when CPUs go offline and
895 * any of the owners goes away. Also pollers run in parallel so we
896 * have to be careful to update the banks in a way that doesn't
897 * lose or duplicate events.
898 */
900 static void mce_set_owner(void)
901 {
902 if (!cmci_support || mce_disabled == 1)
903 return;
905 cmci_discover();
906 }
908 static void __cpu_mcheck_distribute_cmci(void *unused)
909 {
910 cmci_discover();
911 }
913 void cpu_mcheck_distribute_cmci(void)
914 {
915 if (cmci_support && !mce_disabled)
916 on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
917 }
919 static void clear_cmci(void)
920 {
921 int i;
923 if (!cmci_support || mce_disabled == 1)
924 return;
926 mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
927 smp_processor_id());
929 for (i = 0; i < nr_mce_banks; i++) {
930 unsigned msr = MSR_IA32_MC0_CTL2 + i;
931 u64 val;
932 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
933 continue;
934 rdmsrl(msr, val);
935 if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
936 wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
937 clear_bit(i, __get_cpu_var(mce_banks_owned));
938 }
939 }
941 void cpu_mcheck_disable(void)
942 {
943 clear_in_cr4(X86_CR4_MCE);
945 if (cmci_support && !mce_disabled)
946 clear_cmci();
947 }
949 static void intel_init_cmci(struct cpuinfo_x86 *c)
950 {
951 u32 l, apic;
952 int cpu = smp_processor_id();
954 if (!mce_available(c) || !cmci_support) {
955 mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
956 return;
957 }
959 apic = apic_read(APIC_CMCI);
960 if ( apic & APIC_VECTOR_MASK )
961 {
962 mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
963 cpu, ( apic & APIC_VECTOR_MASK ));
964 return;
965 }
967 apic = CMCI_APIC_VECTOR;
968 apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
969 apic_write_around(APIC_CMCI, apic);
971 l = apic_read(APIC_CMCI);
972 apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
973 }
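/* CMCI LVT handler: log out the banks this CPU owns and either commit
 * the telemetry to Dom0 via VIRQ_MCA or dump and dismiss it. */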
975 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
976 {
977 mctelem_cookie_t mctc;
978 struct mca_summary bs;
979 struct cpu_user_regs *old_regs = set_irq_regs(regs);
981 ack_APIC_irq();
982 irq_enter();
984 mctc = mcheck_mca_logout(
985 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
987 if (bs.errcnt && mctc != NULL) {
988 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
989 mctelem_commit(mctc);
990 mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
991 send_guest_global_virq(dom0, VIRQ_MCA);
992 } else {
993 x86_mcinfo_dump(mctelem_dataptr(mctc));
994 mctelem_dismiss(mctc);
995 }
996 } else if (mctc != NULL)
997 mctelem_dismiss(mctc);
999 irq_exit();
1000 set_irq_regs(old_regs);
1001 }
1003 void mce_intel_feature_init(struct cpuinfo_x86 *c)
1004 {
1006 #ifdef CONFIG_X86_MCE_THERMAL
1007 intel_init_thermal(c);
1008 #endif
1009 intel_init_cmci(c);
1010 }
1012 static void _mce_cap_init(struct cpuinfo_x86 *c)
1013 {
1014 u32 l = mce_cap_init();
1016 if ((l & MCG_CMCI_P) && cpu_has_apic)
1017 cmci_support = 1;
1019 /* Support Software Error Recovery */
1020 if (l & MCG_SER_P)
1021 ser_support = 1;
1023 if (l & MCG_EXT_P)
1024 {
1025 nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
1026 mce_printk (MCE_QUIET, "CPU%d: Intel Extended MCE MSRs (%d) available\n",
1027 smp_processor_id(), nr_intel_ext_msrs);
1028 }
1029 firstbank = mce_firstbank(c);
1030 }
1032 static void mce_init(void)
1033 {
1034 u32 l, h;
1035 int i;
1036 mctelem_cookie_t mctc;
1037 struct mca_summary bs;
1039 clear_in_cr4(X86_CR4_MCE);
1041 mce_barrier_init(&mce_inside_bar);
1042 mce_barrier_init(&mce_severity_bar);
1043 mce_barrier_init(&mce_trap_bar);
1044 spin_lock_init(&mce_logout_lock);
1046 /* Log the machine checks left over from the previous reset.
1047 * This also clears all registers. */
1049 mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
1051 /* In the boot-up stage, print out and also log for the DOM0 boot process */
1052 if (bs.errcnt && mctc != NULL) {
1053 x86_mcinfo_dump(mctelem_dataptr(mctc));
1054 mctelem_commit(mctc);
1055 }
1057 set_in_cr4(X86_CR4_MCE);
1059 for (i = firstbank; i < nr_mce_banks; i++)
1060 {
1061 /* Some banks are shared across cores, use MCi_CTL to judge whether
1062 * this bank has been initialized by other cores already. */
1063 rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
1064 if (!(l | h))
1065 {
1066 /* if ctl is 0, this bank is never initialized */
1067 mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
1068 wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
1069 wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
1070 }
1071 }
1072 if (firstbank) /* if cmci enabled, firstbank = 0 */
1073 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
1074 }
1076 /* The P4/P6 families have a similar MCA initialization process */
1077 int intel_mcheck_init(struct cpuinfo_x86 *c)
1078 {
1079 _mce_cap_init(c);
1080 mce_printk(MCE_QUIET, "Intel machine check reporting enabled on CPU#%d.\n",
1081 smp_processor_id());
1083 /* machine check is available */
1084 x86_mce_vector_register(intel_machine_check);
1085 x86_mce_callback_register(intel_get_extended_msrs);
1086 mce_recoverable_register(intel_recoverable_scan);
1087 mce_need_clearbank_register(intel_need_clearbank_scan);
1089 mce_init();
1090 mce_intel_feature_init(c);
1091 mce_set_owner();
1093 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1094 return 1;
1095 }
1097 int intel_mce_wrmsr(uint32_t msr, uint64_t val)
1098 {
1099 int ret = 1;
1101 switch ( msr )
1102 {
1103 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1104 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1105 "Guest should not write this MSR!\n");
1106 break;
1107 default:
1108 ret = 0;
1109 break;
1110 }
1112 return ret;
1113 }
1115 int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
1116 {
1117 int ret = 1;
1119 switch ( msr )
1120 {
1121 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1122 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1123 "Guest should not read this MSR!\n");
1124 break;
1125 default:
1126 ret = 0;
1127 break;
1128 }
1130 return ret;
1131 }