view xen/arch/x86/cpu/mcheck/mce_intel.c @ 20998:50ea24db1f88

x86/mcheck: do not blindly de-reference dom0 et al

Since machine checks and CMCIs can happen before Dom0 even gets
constructed, the handlers of these events have to avoid de-referencing
respective pointers without checking.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 17 12:04:50 2010 +0000 (2010-02-17)
parents da7ae6d8838a
children 6384675aa29a
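
The fix described above comes down to guarding every Dom0 access in the #MC and CMCI paths behind a readiness check. Below is a minimal sketch of that guard pattern, assuming the usual Xen semantics of guest_enabled_event(); the real helper (the dom0_vmce_enabled() called throughout the listing) lives in the common mcheck code, so the name and body here are illustrative only:

/* Illustrative sketch, not the literal helper from the mcheck code. */
static inline int dom0_vmce_enabled_sketch(void)
{
    /* Dom0 (and its vcpu[0]) may not exist yet when an early #MC or CMCI fires. */
    return dom0 != NULL
           && dom0->vcpu != NULL
           && dom0->vcpu[0] != NULL
           && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
}

Callers then test such a helper before touching Dom0, e.g. "if (dom0_vmce_enabled()) send_guest_global_virq(dom0, VIRQ_MCA);", instead of dereferencing dom0 unconditionally.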
line source
1 #include <xen/init.h>
2 #include <xen/types.h>
3 #include <xen/irq.h>
4 #include <xen/event.h>
5 #include <xen/kernel.h>
6 #include <xen/delay.h>
7 #include <xen/smp.h>
8 #include <xen/mm.h>
9 #include <asm/processor.h>
10 #include <public/sysctl.h>
11 #include <asm/system.h>
12 #include <asm/msr.h>
13 #include <asm/p2m.h>
14 #include "mce.h"
15 #include "x86_mca.h"
17 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
18 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
19 int cmci_support = 0;
20 int ser_support = 0;
22 static int nr_intel_ext_msrs = 0;
24 /* Below are for MCE handling */
25 struct mce_softirq_barrier {
26 atomic_t val;
27 atomic_t ingen;
28 atomic_t outgen;
29 };
31 static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
32 static struct mce_softirq_barrier mce_trap_bar;
34 /*
35 * mce_logout_lock should only be used in the trap handler,
36 * while MCIP has not been cleared yet in the global status
37 * register. Other use is not safe, since an MCE trap can
38 * happen at any moment, which would cause lock recursion.
39 */
40 static DEFINE_SPINLOCK(mce_logout_lock);
42 static atomic_t severity_cpu = ATOMIC_INIT(-1);
43 static atomic_t found_error = ATOMIC_INIT(0);
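/* severity_cpu: the CPU elected in mce_softirq() below to process the deferred
 * telemetry; found_error: set by any CPU whose #MC trap handler logged at
 * least one error bank. */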
45 static void mce_barrier_enter(struct mce_softirq_barrier *);
46 static void mce_barrier_exit(struct mce_softirq_barrier *);
48 #ifdef CONFIG_X86_MCE_THERMAL
49 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
50 {
51 printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
52 smp_processor_id());
53 add_taint(TAINT_MACHINE_CHECK);
54 }
56 /* P4/Xeon Thermal transition interrupt handler */
57 static void intel_thermal_interrupt(struct cpu_user_regs *regs)
58 {
59 u32 l, h;
60 unsigned int cpu = smp_processor_id();
61 static s_time_t next[NR_CPUS];
63 ack_APIC_irq();
64 if (NOW() < next[cpu])
65 return;
67 next[cpu] = NOW() + MILLISECS(5000);
68 rdmsr(MSR_IA32_THERM_STATUS, l, h);
69 if (l & 0x1) {
70 printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
71 printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
72 cpu);
73 add_taint(TAINT_MACHINE_CHECK);
74 } else {
75 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
76 }
77 }
79 /* Thermal interrupt handler for this CPU setup */
80 static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs)
81 = unexpected_thermal_interrupt;
83 fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
84 {
85 struct cpu_user_regs *old_regs = set_irq_regs(regs);
86 irq_enter();
87 vendor_thermal_interrupt(regs);
88 irq_exit();
89 set_irq_regs(old_regs);
90 }
92 /* P4/Xeon Thermal regulation detect and init */
93 static void intel_init_thermal(struct cpuinfo_x86 *c)
94 {
95 u32 l, h;
96 int tm2 = 0;
97 unsigned int cpu = smp_processor_id();
99 /* Thermal monitoring */
100 if (!cpu_has(c, X86_FEATURE_ACPI))
101 return; /* -ENODEV */
103 /* Clock modulation */
104 if (!cpu_has(c, X86_FEATURE_ACC))
105 return; /* -ENODEV */
107 /* First check if it's enabled already, in which case there might
108 * be some SMM goo which handles it, so we can't even install a handler
109 * since it might already be delivered via SMI. -zwanem.
110 */
111 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
112 h = apic_read(APIC_LVTTHMR);
113 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
114 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
115 return; /* -EBUSY */
116 }
118 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
119 tm2 = 1;
121 /* check whether a vector already exists, temporarily masked? */
122 if (h & APIC_VECTOR_MASK) {
123 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
124 cpu, (h & APIC_VECTOR_MASK));
125 return; /* -EBUSY */
126 }
128 /* The temperature transition interrupt handler setup */
129 h = THERMAL_APIC_VECTOR; /* our delivery vector */
130 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
131 apic_write_around(APIC_LVTTHMR, h);
133 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
134 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
136 /* ok we're good to go... */
137 vendor_thermal_interrupt = intel_thermal_interrupt;
139 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
140 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
142 l = apic_read (APIC_LVTTHMR);
143 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
144 printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
145 cpu, tm2 ? "TM2" : "TM1");
146 return;
147 }
148 #endif /* CONFIG_X86_MCE_THERMAL */
150 static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
151 {
152 if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
153 && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
154 ext->mc_msr[ext->mc_msrs].reg = msr;
155 mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
156 ++ext->mc_msrs;
157 }
158 }
160 static enum mca_extinfo
161 intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
162 {
163 struct mcinfo_extended mc_ext;
165 if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
166 return MCA_EXTINFO_IGNORED;
168 /* This function will be called when CAP(9).MCG_EXT_P = 1 */
169 memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
170 mc_ext.common.type = MC_TYPE_EXTENDED;
171 mc_ext.common.size = sizeof(mc_ext);
173 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EAX);
174 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBX);
175 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ECX);
176 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDX);
177 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESI);
178 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EDI);
179 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EBP);
180 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_ESP);
181 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EFLAGS);
182 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_EIP);
183 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_MISC);
185 #ifdef __x86_64__
186 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R8);
187 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R9);
188 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R10);
189 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R11);
190 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R12);
191 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R13);
192 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R14);
193 intel_get_extended_msr(&mc_ext, MSR_IA32_MCG_R15);
194 #endif
196 x86_mcinfo_add(mci, &mc_ext);
198 return MCA_EXTINFO_GLOBAL;
199 }
201 /* This node list records errors impacting a domain. When an
202 * MCE# happens, one error bank impacts a domain. The error node
203 * is inserted at the tail of the per-domain data for vMCE# MSR
204 * virtualization. Once the corresponding vMCE# injection has been
205 * processed by the guest, the node is deleted.
206 * This node list exists solely for guest vMCE# MSR virtualization.
207 */
208 static struct bank_entry* alloc_bank_entry(void) {
209 struct bank_entry *entry;
211 entry = xmalloc(struct bank_entry);
212 if (!entry) {
213 printk(KERN_ERR "MCE: malloc bank_entry failed\n");
214 return NULL;
215 }
216 memset(entry, 0x0, sizeof(*entry));
217 INIT_LIST_HEAD(&entry->list);
218 return entry;
219 }
221 /* Fill error bank info for vMCE# injection and guest vMCE#
222 * MSR virtualization data:
223 * 1) Record how many injections (nr_injection) impact the domain.
224 * 2) Copy the MCE# error bank to the impacted domain's node list,
225 * for vMCE# MSR virtualization.
226 */
228 static int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
229 uint64_t gstatus) {
230 struct bank_entry *entry;
232 /* This error bank impacts one domain; we need to fill the domain-related
233 * data for vMCE MSR virtualization and vMCE# injection. */
234 if (mc_bank->mc_domid != (uint16_t)~0) {
235 /* For an HVM guest, only after the first vMCE has been consumed
236 * successfully by the guest do we generate another node and inject another vMCE.
237 */
238 if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
239 {
240 mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
241 " vMCE yet!\n");
242 return -1;
243 }
244 entry = alloc_bank_entry();
245 if (entry == NULL)
246 return -1;
248 entry->mci_status = mc_bank->mc_status;
249 entry->mci_addr = mc_bank->mc_addr;
250 entry->mci_misc = mc_bank->mc_misc;
251 entry->bank = mc_bank->mc_bank;
253 spin_lock(&d->arch.vmca_msrs.lock);
254 /* New error Node, insert to the tail of the per_dom data */
255 list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
256 /* Fill MSR global status */
257 d->arch.vmca_msrs.mcg_status = gstatus;
258 /* New node impact the domain, need another vMCE# injection*/
259 d->arch.vmca_msrs.nr_injection++;
260 spin_unlock(&d->arch.vmca_msrs.lock);
262 mce_printk(MCE_VERBOSE, "MCE: Found error @[BANK%d "
263 "status %"PRIx64" addr %"PRIx64" domid %d]\n",
264 mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
265 mc_bank->mc_domid);
266 }
267 return 0;
268 }
270 static int inject_mce(struct domain *d)
271 {
272 int cpu = smp_processor_id();
273 cpumask_t affinity;
275 /* PV guests and HVM guests have different vMCE# injection
276 * methods. */
278 if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
279 {
280 if (d->is_hvm)
281 {
282 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
283 d->domain_id);
284 vcpu_kick(d->vcpu[0]);
285 }
286 /* PV guest including DOM0 */
287 else
288 {
289 mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
290 d->domain_id);
291 if (guest_has_trap_callback
292 (d, 0, TRAP_machine_check))
293 {
294 d->vcpu[0]->cpu_affinity_tmp =
295 d->vcpu[0]->cpu_affinity;
296 cpus_clear(affinity);
297 cpu_set(cpu, affinity);
298 mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", cpu,
299 d->vcpu[0]->processor);
300 vcpu_set_affinity(d->vcpu[0], &affinity);
301 vcpu_kick(d->vcpu[0]);
302 }
303 else
304 {
305 mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE handler\n");
306 domain_crash(d);
307 }
308 }
309 }
310 else {
311 /* A new vMCE arrived while the first one has not been injected yet;
312 * in this case the injection fails. [We can't lose this vMCE, for
313 * the sake of the mce node list's consistency.]
314 */
315 mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected"
316 " to this DOM%d!\n", d->domain_id);
317 return -1;
318 }
319 return 0;
320 }
322 static void intel_UCR_handler(struct mcinfo_bank *bank,
323 struct mcinfo_global *global,
324 struct mcinfo_extended *extension,
325 struct mca_handle_result *result)
326 {
327 struct domain *d;
328 unsigned long mfn, gfn;
329 uint32_t status;
331 mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
332 result->result = MCA_NEED_RESET;
333 if (bank->mc_addr != 0) {
334 mfn = bank->mc_addr >> PAGE_SHIFT;
335 if (!offline_page(mfn, 1, &status)) {
336 /* This is a free page */
337 if (status & PG_OFFLINE_OFFLINED)
338 result->result = MCA_RECOVERED;
339 else if (status & PG_OFFLINE_PENDING) {
340 /* This page has an owner */
341 if (status & PG_OFFLINE_OWNED) {
342 result->result |= MCA_OWNER;
343 result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
344 mce_printk(MCE_QUIET, "MCE: This error page is owned"
345 " by DOM %d\n", result->owner);
346 /* Fill vMCE# injection and vMCE# MSR virtualization
347 * related data */
348 bank->mc_domid = result->owner;
349 /* XXX: Cannot handle shared pages yet
350 * (this should identify all domains and gfn mapping to
351 * the mfn in question) */
352 BUG_ON( result->owner == DOMID_COW );
353 if ( result->owner != DOMID_XEN ) {
355 d = get_domain_by_id(result->owner);
356 if ( mca_ctl_conflict(bank, d) )
357 {
358 /* Guest has different MCE ctl with hypervisor */
359 if ( d )
360 put_domain(d);
361 return;
362 }
364 ASSERT(d);
365 gfn =
366 get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
367 bank->mc_addr = gfn << PAGE_SHIFT |
368 (bank->mc_addr & (PAGE_SIZE -1 ));
369 if ( fill_vmsr_data(bank, d,
370 global->mc_gstatus) == -1 )
371 {
372 mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
373 "failed\n", result->owner);
374 put_domain(d);
375 domain_crash(d);
376 return;
377 }
378 /* We will inject vMCE to DOMU*/
379 if ( inject_mce(d) < 0 )
380 {
381 mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
382 " failed\n", d->domain_id);
383 put_domain(d);
384 domain_crash(d);
385 return;
386 }
387 /* The impacted domain goes on with its own recovery job
388 * if it has registered an MCA handler.
389 * As far as Xen is concerned, it has contained the error and
390 * finished its own recovery job.
391 */
392 result->result = MCA_RECOVERED;
393 put_domain(d);
394 }
395 }
396 }
397 }
398 }
399 }
401 #define INTEL_MAX_RECOVERY 2
402 struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
403 {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
405 /*
406 * Called from mctelem_process_deferred. Return 1 if the telemetry
407 * should be committed for dom0 consumption, 0 if it should be
408 * dismissed.
409 */
410 static int mce_action(mctelem_cookie_t mctc)
411 {
412 struct mc_info *local_mi;
413 uint32_t i;
414 struct mcinfo_common *mic = NULL;
415 struct mcinfo_global *mc_global;
416 struct mcinfo_bank *mc_bank;
417 struct mca_handle_result mca_res;
419 local_mi = (struct mc_info*)mctelem_dataptr(mctc);
420 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
421 if (mic == NULL) {
422 printk(KERN_ERR "MCE: get local buffer entry failed\n");
423 return 0;
424 }
426 mc_global = (struct mcinfo_global *)mic;
428 /* Processing bank information */
429 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
431 for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
432 if (mic->type != MC_TYPE_BANK) {
433 continue;
434 }
435 mc_bank = (struct mcinfo_bank*)mic;
437 /* TODO: Add recovery actions here, such as page-offline, etc */
438 memset(&mca_res, 0x0f, sizeof(mca_res));
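/* Match the bank's MCA error code against the recovery table, either exactly
 * (0xffff mask) or with the low nibble of the code wildcarded (0xfff0 mask,
 * e.g. matching 0x00C0-0x00CF); the low bits presumably carry sub-information
 * such as the memory channel. */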
439 for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
440 if ( ((mc_bank->mc_status & 0xffff) ==
441 intel_recovery_handler[i].mca_code) ||
442 ((mc_bank->mc_status & 0xfff0) ==
443 intel_recovery_handler[i].mca_code)) {
444 /* For SRAR, OVER = 1 should already have caused a reset.
445 * For SRAO, OVER = 1 skips the recovery action; execution continues.
446 */
447 if (!(mc_bank->mc_status & MCi_STATUS_OVER))
448 intel_recovery_handler[i].recovery_handler
449 (mc_bank, mc_global, NULL, &mca_res);
450 else {
451 if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
452 mca_res.result = MCA_NEED_RESET;
453 else
454 mca_res.result = MCA_NO_ACTION;
455 }
456 if (mca_res.result & MCA_OWNER)
457 mc_bank->mc_domid = mca_res.owner;
458 if (mca_res.result == MCA_NEED_RESET)
459 /* DOMID_XEN */
460 mc_panic("MCE: Software recovery failed for the UCR "
461 "error\n");
462 else if (mca_res.result == MCA_RECOVERED)
463 mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
464 " successfully recovered by software!\n");
465 else if (mca_res.result == MCA_NO_ACTION)
466 mce_printk(MCE_VERBOSE, "MCE: SRAO error with OVER set can't"
467 " do the recovery action, RIPV=1, let it be.\n");
468 break;
469 }
470 }
471 /* For SRAR, the absence of a defined recovery action should already
472 * have caused a reset in the MCA handler.
473 */
474 if ( i >= INTEL_MAX_RECOVERY )
475 mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
476 " found for this SRAO error\n");
478 }
479 return 1;
480 }
482 /* Softirq Handler for this MCE# processing */
483 static void mce_softirq(void)
484 {
485 int cpu = smp_processor_id();
486 unsigned int workcpu;
488 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
490 mce_barrier_enter(&mce_inside_bar);
492 /*
493 * Everybody is here. Now let's see who gets to do the
494 * recovery work. Right now we just see if there's a CPU
495 * that did not have any problems, and pick that one.
496 *
497 * First, just set a default value: the last CPU who reaches this
498 * will overwrite the value and become the default.
499 */
501 atomic_set(&severity_cpu, cpu);
503 mce_barrier_enter(&mce_severity_bar);
504 if (!mctelem_has_deferred(cpu))
505 atomic_set(&severity_cpu, cpu);
506 mce_barrier_exit(&mce_severity_bar);
508 /* We choose severity_cpu for further processing */
509 if (atomic_read(&severity_cpu) == cpu) {
511 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
513 /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
514 * vMCE MSRs virtualization buffer
515 */
516 for_each_online_cpu(workcpu) {
517 mctelem_process_deferred(workcpu, mce_action);
518 }
520 /* Step2: Send Log to DOM0 through vIRQ */
521 if (dom0_vmce_enabled()) {
522 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
523 send_guest_global_virq(dom0, VIRQ_MCA);
524 }
525 }
527 mce_barrier_exit(&mce_inside_bar);
528 }
530 /* Machine check owner election algorithm:
531 * When an error happens, all CPUs serially read their MSR banks.
532 * The first CPU to fetch an error bank's info clears that bank,
533 * so later readers can't get the info again.
534 * That first CPU is the actual mce_owner.
535 *
536 * A fatal (pcc = 1) error might crash the machine before we are
537 * able to log it. To avoid losing the log, we adopt two-round
538 * scanning:
539 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
540 * All MCE banks are sticky; at the next boot, the MCE polling
541 * mechanism will collect and log those errors.
542 * Round 2: do all the MCE processing logic as normal.
543 */
545 static void mce_panic_check(void)
546 {
547 if (is_mc_panic) {
548 local_irq_enable();
549 for ( ; ; )
550 halt();
551 }
552 }
554 /*
555 * Initialize a barrier. Just set it to 0.
556 */
557 static void mce_barrier_init(struct mce_softirq_barrier *bar)
558 {
559 atomic_set(&bar->val, 0);
560 atomic_set(&bar->ingen, 0);
561 atomic_set(&bar->outgen, 0);
562 }
564 #if 0
565 /*
566 * This function will need to be used when offlining a CPU in the
567 * recovery actions.
568 *
569 * Decrement a barrier only. Needed for cases where the CPU
570 * in question can't do it itself (e.g. it is being offlined).
571 */
572 static void mce_barrier_dec(struct mce_softirq_barrier *bar)
573 {
574 atomic_inc(&bar->outgen);
575 wmb();
576 atomic_dec(&bar->val);
577 }
578 #endif
580 static void mce_spin_lock(spinlock_t *lk)
581 {
582 while (!spin_trylock(lk)) {
583 cpu_relax();
584 mce_panic_check();
585 }
586 }
588 static void mce_spin_unlock(spinlock_t *lk)
589 {
590 spin_unlock(lk);
591 }
593 /*
594 * Increment the generation number and the value. The generation number
595 * is incremented when entering a barrier. This way, it can be checked
596 * on exit if a CPU is trying to re-enter the barrier. This can happen
597 * if the first CPU to make it out immediately exits or re-enters, while
598 * another CPU that is still in the loop becomes otherwise occupied
599 * (e.g. it needs to service an interrupt, etc), missing the value
600 * it's waiting for.
601 *
602 * These barrier functions should always be paired, so that the
603 * counter value will reach 0 again after all CPUs have exited.
604 */
605 static void mce_barrier_enter(struct mce_softirq_barrier *bar)
606 {
607 int gen;
609 if (!mce_broadcast)
610 return;
611 atomic_inc(&bar->ingen);
612 gen = atomic_read(&bar->outgen);
613 mb();
614 atomic_inc(&bar->val);
615 while ( atomic_read(&bar->val) != num_online_cpus() &&
616 atomic_read(&bar->outgen) == gen) {
617 mb();
618 mce_panic_check();
619 }
620 }
622 static void mce_barrier_exit(struct mce_softirq_barrier *bar)
623 {
624 int gen;
626 if (!mce_broadcast)
627 return;
628 atomic_inc(&bar->outgen);
629 gen = atomic_read(&bar->ingen);
630 mb();
631 atomic_dec(&bar->val);
632 while ( atomic_read(&bar->val) != 0 &&
633 atomic_read(&bar->ingen) == gen ) {
634 mb();
635 mce_panic_check();
636 }
637 }
639 #if 0
640 static void mce_barrier(struct mce_softirq_barrier *bar)
641 {
642 mce_barrier_enter(bar);
643 mce_barrier_exit(bar);
644 }
645 #endif
647 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
648 {
649 uint64_t gstatus;
650 mctelem_cookie_t mctc = NULL;
651 struct mca_summary bs;
652 cpu_banks_t clear_bank;
654 mce_spin_lock(&mce_logout_lock);
656 memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
657 mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
659 if (bs.errcnt) {
660 /* dump MCE error */
661 if (mctc != NULL)
662 x86_mcinfo_dump(mctelem_dataptr(mctc));
664 /*
665 * Uncorrected errors must be dealt with in softirq context.
666 */
667 if (bs.uc || bs.pcc) {
668 add_taint(TAINT_MACHINE_CHECK);
669 if (mctc != NULL)
670 mctelem_defer(mctc);
671 /*
672 * If PCC = 1 and the error can't be recovered, context is lost, so
673 * reboot now without clearing the banks, and deal with the telemetry
674 * after reboot (the MSRs are sticky).
675 */
676 if (bs.pcc)
677 mc_panic("State lost due to machine check exception.\n");
678 if (!bs.ripv)
679 mc_panic("RIPV = 0, can't resume execution!\n");
680 if (!bs.recoverable)
681 mc_panic("Machine check exception software recovery failed.\n");
682 } else {
683 if (mctc != NULL)
684 mctelem_commit(mctc);
685 }
686 atomic_set(&found_error, 1);
688 mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
689 *((unsigned long*)clear_bank), smp_processor_id());
690 mcheck_mca_clearbanks(clear_bank);
691 } else {
692 if (mctc != NULL)
693 mctelem_dismiss(mctc);
694 }
695 mce_spin_unlock(&mce_logout_lock);
697 /*
698 * Wait until everybody has processed the trap.
699 */
700 mce_barrier_enter(&mce_trap_bar);
701 /* According to the latest MCA OS writer's guide, if no error bank is
702 * found on any CPU, something unexpected is happening; we can't do any
703 * recovery job but reset the system.
704 */
705 if (atomic_read(&found_error) == 0)
706 mc_panic("Unexpected condition for the MCE handler, need reset\n");
707 mce_barrier_exit(&mce_trap_bar);
709 /* Clear the error-found flag after all CPUs have finished the above check */
710 mce_barrier_enter(&mce_trap_bar);
711 if (atomic_read(&found_error)) {
712 mce_printk(MCE_CRITICAL, "MCE: Choose one CPU "
713 "to clear error finding flag\n");
714 atomic_set(&found_error, 0);
715 }
716 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
717 if ((gstatus & MCG_STATUS_MCIP) != 0) {
718 mce_printk(MCE_CRITICAL, "MCE: Clear MCIP at the last step\n");
719 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
720 }
721 mce_barrier_exit(&mce_trap_bar);
723 raise_softirq(MACHINE_CHECK_SOFTIRQ);
724 }
726 /* According to the MCA OS writer's guide, the CMCI handler needs to clear a bank when
727 * 1) CE (UC = 0)
728 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
729 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
730 * The MCA handler needs to clear a bank when
731 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
732 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
733 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
734 */
736 static int intel_need_clearbank_scan(enum mca_source who, u64 status)
737 {
738 if ( who == MCA_CMCI_HANDLER) {
739 /* CE needs the bank cleared */
740 if ( !(status & MCi_STATUS_UC) )
741 return 1;
742 /* A spurious error needs the bank cleared */
743 else if ( ser_support && !(status & MCi_STATUS_OVER)
744 && !(status & MCi_STATUS_EN) )
745 return 1;
746 /* UCNA with OVER = 0 needs the bank cleared */
747 else if ( ser_support && !(status & MCi_STATUS_OVER)
748 && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
749 && !(status & MCi_STATUS_AR))
750 return 1;
751 /* Only Log, no clear */
752 else return 0;
753 }
754 else if ( who == MCA_MCE_SCAN) {
755 /* A spurious error needs the bank cleared */
756 if ( ser_support && !(status & MCi_STATUS_OVER)
757 && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
758 return 1;
759 /* SRAR with OVER = 0: clear the bank. OVER = 1 should have caused a reset */
760 else if ( ser_support && (status & MCi_STATUS_UC)
761 && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
762 && (status & MCi_STATUS_OVER) )
763 return 1;
764 /* SRAO needs the bank cleared */
765 else if ( ser_support && !(status & MCi_STATUS_AR)
766 && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
767 return 1;
768 else
769 return 0;
770 }
772 return 1;
773 }
775 /* MCE continues/is recoverable when
776 * 1) CE UC = 0
777 * 2) Spurious ser_support = 1, OVER = 0, EN = 0 [UC = 1]
778 * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
779 * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
780 * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
781 */
782 static int intel_recoverable_scan(u64 status)
783 {
785 if ( !(status & MCi_STATUS_UC ) )
786 return 1;
787 else if ( ser_support && !(status & MCi_STATUS_EN)
788 && !(status & MCi_STATUS_OVER) )
789 return 1;
790 /* SRAR error */
791 else if ( ser_support && !(status & MCi_STATUS_OVER)
792 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
793 && (status & MCi_STATUS_AR) ) {
794 mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
795 return 0;
796 }
797 /* SRAO error */
798 else if (ser_support && !(status & MCi_STATUS_PCC)
799 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
800 && (status & MCi_STATUS_EN))
801 return 1;
802 /* UCNA error */
803 else if (ser_support && !(status & MCi_STATUS_OVER)
804 && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
805 && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
806 return 1;
807 return 0;
808 }
810 static DEFINE_SPINLOCK(cmci_discover_lock);
812 /*
813 * Discover bank sharing using the algorithm recommended in the SDM.
814 */
815 static int do_cmci_discover(int i)
816 {
817 unsigned msr = MSR_IA32_MC0_CTL2 + i;
818 u64 val;
820 rdmsrl(msr, val);
821 /* Some other CPU already owns this bank. */
822 if (val & CMCI_EN) {
823 clear_bit(i, __get_cpu_var(mce_banks_owned));
824 goto out;
825 }
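/* Probe for CMCI support by setting CMCI_EN plus a threshold and reading the
 * MSR back: on banks without CMCI the EN bit does not stick. */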
826 wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
827 rdmsrl(msr, val);
829 if (!(val & CMCI_EN)) {
830 /* This bank does not support CMCI. Polling timer has to handle it. */
831 set_bit(i, __get_cpu_var(no_cmci_banks));
832 return 0;
833 }
834 set_bit(i, __get_cpu_var(mce_banks_owned));
835 out:
836 clear_bit(i, __get_cpu_var(no_cmci_banks));
837 return 1;
838 }
840 static void cmci_discover(void)
841 {
842 unsigned long flags;
843 int i;
844 mctelem_cookie_t mctc;
845 struct mca_summary bs;
847 mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
849 spin_lock_irqsave(&cmci_discover_lock, flags);
851 for (i = 0; i < nr_mce_banks; i++)
852 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
853 do_cmci_discover(i);
855 spin_unlock_irqrestore(&cmci_discover_lock, flags);
857 /* Handle any CMCI that happened during the owner change.
858 * If a CMCI happened but was not processed immediately,
859 * MCi_STATUS (error_count, bits 38~52) is not cleared and
860 * the CMCI interrupt will never be triggered again.
861 */
863 mctc = mcheck_mca_logout(
864 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
866 if (bs.errcnt && mctc != NULL) {
867 if (dom0_vmce_enabled()) {
868 mctelem_commit(mctc);
869 send_guest_global_virq(dom0, VIRQ_MCA);
870 } else {
871 x86_mcinfo_dump(mctelem_dataptr(mctc));
872 mctelem_dismiss(mctc);
873 }
874 } else if (mctc != NULL)
875 mctelem_dismiss(mctc);
877 mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
878 smp_processor_id(),
879 *((unsigned long *)__get_cpu_var(mce_banks_owned)),
880 *((unsigned long *)__get_cpu_var(no_cmci_banks)));
881 }
883 /*
884 * Define an owner for each bank. Banks can be shared between CPUs
885 * and to avoid reporting events multiple times always set up one
886 * CPU as owner.
887 *
888 * The assignment has to be redone when CPUs go offline and
889 * any of the owners goes away. Also pollers run in parallel so we
890 * have to be careful to update the banks in a way that doesn't
891 * lose or duplicate events.
892 */
894 static void mce_set_owner(void)
895 {
896 if (!cmci_support || mce_disabled == 1)
897 return;
899 cmci_discover();
900 }
902 static void __cpu_mcheck_distribute_cmci(void *unused)
903 {
904 cmci_discover();
905 }
907 void cpu_mcheck_distribute_cmci(void)
908 {
909 if (cmci_support && !mce_disabled)
910 on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
911 }
913 static void clear_cmci(void)
914 {
915 int i;
917 if (!cmci_support || mce_disabled == 1)
918 return;
920 mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
921 smp_processor_id());
923 for (i = 0; i < nr_mce_banks; i++) {
924 unsigned msr = MSR_IA32_MC0_CTL2 + i;
925 u64 val;
926 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
927 continue;
928 rdmsrl(msr, val);
929 if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
930 wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
931 clear_bit(i, __get_cpu_var(mce_banks_owned));
932 }
933 }
935 void cpu_mcheck_disable(void)
936 {
937 clear_in_cr4(X86_CR4_MCE);
939 if (cmci_support && !mce_disabled)
940 clear_cmci();
941 }
943 static void intel_init_cmci(struct cpuinfo_x86 *c)
944 {
945 u32 l, apic;
946 int cpu = smp_processor_id();
948 if (!mce_available(c) || !cmci_support) {
949 mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
950 return;
951 }
953 apic = apic_read(APIC_CMCI);
954 if ( apic & APIC_VECTOR_MASK )
955 {
956 mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
957 cpu, ( apic & APIC_VECTOR_MASK ));
958 return;
959 }
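/* Program the CMCI LVT entry masked first, then unmask it once the vector is
 * in place (the same pattern as the thermal LVT setup above). */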
961 apic = CMCI_APIC_VECTOR;
962 apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
963 apic_write_around(APIC_CMCI, apic);
965 l = apic_read(APIC_CMCI);
966 apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
967 }
969 fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
970 {
971 mctelem_cookie_t mctc;
972 struct mca_summary bs;
973 struct cpu_user_regs *old_regs = set_irq_regs(regs);
975 ack_APIC_irq();
976 irq_enter();
978 mctc = mcheck_mca_logout(
979 MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
981 if (bs.errcnt && mctc != NULL) {
982 if (dom0_vmce_enabled()) {
983 mctelem_commit(mctc);
984 mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
985 send_guest_global_virq(dom0, VIRQ_MCA);
986 } else {
987 x86_mcinfo_dump(mctelem_dataptr(mctc));
988 mctelem_dismiss(mctc);
989 }
990 } else if (mctc != NULL)
991 mctelem_dismiss(mctc);
993 irq_exit();
994 set_irq_regs(old_regs);
995 }
997 void mce_intel_feature_init(struct cpuinfo_x86 *c)
998 {
1000 #ifdef CONFIG_X86_MCE_THERMAL
1001 intel_init_thermal(c);
1002 #endif
1003 intel_init_cmci(c);
1004 }
1006 static void _mce_cap_init(struct cpuinfo_x86 *c)
1007 {
1008 u32 l = mce_cap_init();
1010 if ((l & MCG_CMCI_P) && cpu_has_apic)
1011 cmci_support = 1;
1013 /* Support Software Error Recovery */
1014 if (l & MCG_SER_P)
1015 ser_support = 1;
1017 if (l & MCG_EXT_P)
1018 {
1019 nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
1020 mce_printk (MCE_QUIET, "CPU%d: Intel Extended MCE MSRs (%d) available\n",
1021 smp_processor_id(), nr_intel_ext_msrs);
1022 }
1023 firstbank = mce_firstbank(c);
1024 }
1026 static void mce_init(void)
1027 {
1028 u32 l, h;
1029 int i;
1030 mctelem_cookie_t mctc;
1031 struct mca_summary bs;
1033 clear_in_cr4(X86_CR4_MCE);
1035 mce_barrier_init(&mce_inside_bar);
1036 mce_barrier_init(&mce_severity_bar);
1037 mce_barrier_init(&mce_trap_bar);
1038 spin_lock_init(&mce_logout_lock);
1040 /* Log the machine checks left over from the previous reset.
1041 * This also clears all registers. */
1043 mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
1045 /* In the boot-up stage, print the errors out and also log them for DOM0 to process */
1046 if (bs.errcnt && mctc != NULL) {
1047 x86_mcinfo_dump(mctelem_dataptr(mctc));
1048 mctelem_commit(mctc);
1049 }
1051 set_in_cr4(X86_CR4_MCE);
1053 for (i = firstbank; i < nr_mce_banks; i++)
1054 {
1055 /* Some banks are shared across cores; use MCi_CTL to check whether
1056 * this bank has already been initialized by another core. */
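/* Each bank owns four architectural MSRs (CTL/STATUS/ADDR/MISC), hence the
 * stride of 4 from MSR_IA32_MC0_CTL. */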
1057 rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
1058 if (!(l | h))
1059 {
1060 /* if ctl is 0, this bank is never initialized */
1061 mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
1062 wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
1063 wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
1064 }
1065 }
1066 if (firstbank) /* if CMCI is enabled, firstbank = 0 */
1067 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
1068 }
1070 /* P4/P6 family members share a similar MCA initialization process */
1071 int intel_mcheck_init(struct cpuinfo_x86 *c)
1072 {
1073 _mce_cap_init(c);
1074 mce_printk(MCE_QUIET, "Intel machine check reporting enabled on CPU#%d.\n",
1075 smp_processor_id());
1077 /* machine check is available */
1078 x86_mce_vector_register(intel_machine_check);
1079 x86_mce_callback_register(intel_get_extended_msrs);
1080 mce_recoverable_register(intel_recoverable_scan);
1081 mce_need_clearbank_register(intel_need_clearbank_scan);
1083 mce_init();
1084 mce_intel_feature_init(c);
1085 mce_set_owner();
1087 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1088 return 1;
1089 }
1091 int intel_mce_wrmsr(uint32_t msr, uint64_t val)
1092 {
1093 int ret = 1;
1095 switch ( msr )
1096 {
1097 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1098 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1099 "Guest should not write this MSR!\n");
1100 break;
1101 default:
1102 ret = 0;
1103 break;
1104 }
1106 return ret;
1107 }
1109 int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
1110 {
1111 int ret = 1;
1113 switch ( msr )
1114 {
1115 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MC0_CTL2 + MAX_NR_BANKS - 1:
1116 mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
1117 "Guest should not read this MSR!\n");
1118 break;
1119 default:
1120 ret = 0;
1121 break;
1122 }
1124 return ret;
1125 }