debuggers.hg

view xen/arch/x86/cpu/mcheck/mce.c @ 20911:088f1b01d852

x86 mca: Add MCE broadcast checking.

Some platforms broadcast MCE to all logical processors, while others
do not. Distinguishing between these platforms is helpful for a
unified MCA handler.

the "mce_fb" is a option to emulate the broadcast MCA in non-broadcast
platform. This is mainly for MCA software trigger.
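As a rough usage illustration (the bootloader stanza and paths here are examples, not part of this changeset), the new option is passed on the Xen command line, e.g.:

    kernel /boot/xen.gz mce_fb=1

"mce_fb=1" turns on mce_force_broadcast, so a software-triggered #MC is handled as if the platform had broadcast it. Machine check support as a whole is controlled by the inverted boolean "mce" parameter registered below, so something like "mce=0" disables it.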

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jan 29 06:49:42 2010 +0000 (2010-01-29)
parents 7310235f74f8
children ebd2495ec073
line source
1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/config.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/sched-if.h>
15 #include <xen/cpumask.h>
16 #include <xen/event.h>
17 #include <xen/guest_access.h>
18 #include <xen/hypercall.h> /* for do_mca */
20 #include <asm/processor.h>
21 #include <asm/system.h>
22 #include <asm/msr.h>
24 #include "mce.h"
26 int mce_disabled;
27 invbool_param("mce", mce_disabled);
28 static int mce_force_broadcast;
29 boolean_param("mce_fb", mce_force_broadcast);
30 int is_mc_panic;
31 unsigned int nr_mce_banks;
33 int mce_broadcast = 0;
34 static uint64_t g_mcg_cap;
36 /* Real value in physical CTL MSR */
37 static uint64_t h_mcg_ctl = 0UL;
38 static uint64_t *h_mci_ctrl;
39 int firstbank;
41 static void intpose_init(void);
42 static void mcinfo_clear(struct mc_info *);
44 #define SEG_PL(segsel) ((segsel) & 0x3)
45 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
47 #if 0
48 static int x86_mcerr(const char *msg, int err)
49 {
50 gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n",
51 msg != NULL ? msg : "", err);
52 return err;
53 }
54 #else
55 #define x86_mcerr(msg, err) (err)
56 #endif
58 cpu_banks_t mca_allbanks;
60 int mce_verbosity;
61 static void __init mce_set_verbosity(char *str)
62 {
63 if (strcmp("verbose", str) == 0)
64 mce_verbosity = MCE_VERBOSE;
65 else
66 printk(KERN_DEBUG "Machine Check verbosity level %s not recognised, "
67 "use mce_verbosity=verbose\n", str);
68 }
69 custom_param("mce_verbosity", mce_set_verbosity);
71 /* Handle unconfigured int18 (should never happen) */
72 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
73 {
74 printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
75 smp_processor_id());
76 }
79 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
81 void x86_mce_vector_register(x86_mce_vector_t hdlr)
82 {
83 _machine_check_vector = hdlr;
84 wmb();
85 }
87 /* Call the installed machine check handler for this CPU setup. */
89 void machine_check_vector(struct cpu_user_regs *regs, long error_code)
90 {
91 _machine_check_vector(regs, error_code);
92 }
94 /* Init machine check callback handler
95 * It is used to collect additional information provided by newer
96 * CPU families/models without the need to duplicate the whole handler.
97 * This avoids having many handlers doing almost the same thing, each
98 * with its own tweaks and bugs. */
99 static x86_mce_callback_t mc_callback_bank_extended = NULL;
101 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
102 {
103 mc_callback_bank_extended = cbfunc;
104 }
106 /* Machine check recoverable judgement callback handler
107 * It is used to judge whether a UC error is recoverable by software
108 */
109 static mce_recoverable_t mc_recoverable_scan = NULL;
111 void mce_recoverable_register(mce_recoverable_t cbfunc)
112 {
113 mc_recoverable_scan = cbfunc;
114 }
116 /* Callback handler for judging whether to clear a Machine Check error bank.
117 * According to Intel's latest MCA OS Recovery Writer's Guide, whether the
118 * error MCA bank needs to be cleared is decided by the mca_source and the
119 * MCi_STATUS bit values.
120 */
121 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
123 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
124 {
125 mc_need_clearbank_scan = cbfunc;
126 }
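The registration hooks above (x86_mce_vector_register, x86_mce_callback_register, mce_recoverable_register, mce_need_clearbank_register) are how vendor- or family-specific code plugs into this common core. A minimal sketch of how a hypothetical CPU-specific init path might wire itself up; the my_vendor_* names are invented for illustration and the callback signatures are assumed to match those in mce.h:

    /* Illustrative only: my_vendor_* are made-up helpers, not Xen code. */
    static void my_vendor_mce_handler(struct cpu_user_regs *regs, long error_code)
    {
        /* e.g. reuse the common handler (defined later in this file)
         * with a vendor-chosen bank mask */
        mcheck_cmn_handler(regs, error_code, mca_allbanks);
    }

    static int my_vendor_is_recoverable(uint64_t status)
    {
        /* judge from MCi_STATUS bits whether software can recover */
        return !(status & MCi_STATUS_PCC);
    }

    static void my_vendor_mcheck_init(struct cpuinfo_x86 *c)
    {
        x86_mce_vector_register(my_vendor_mce_handler);
        mce_recoverable_register(my_vendor_is_recoverable);
        /* mce_need_clearbank_register() and x86_mce_callback_register()
         * would be hooked up the same way when needed. */
    }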
128 /* Utility function to perform MCA bank telemetry readout and to push that
129 * telemetry towards an interested dom0 for logging and diagnosis.
130 * The caller - #MC handler or MCA poll function - must arrange that we
131 * do not migrate cpus. */
133 /* XXFM Could add overflow counting? */
135 /* Add out_param clear_bank for the machine check handler caller.
136 * For recent Intel CPUs, whether to clear the error bank status needs to
137 * be decided by the callback function defined above.
138 */
139 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
140 struct mca_summary *sp, cpu_banks_t* clear_bank)
141 {
142 struct vcpu *v = current;
143 struct domain *d;
144 uint64_t gstatus, status, addr, misc;
145 struct mcinfo_global mcg; /* on stack */
146 struct mcinfo_common *mic;
147 struct mcinfo_global *mig; /* on stack */
148 mctelem_cookie_t mctc = NULL;
149 uint32_t uc = 0, pcc = 0, recover, need_clear = 1 ;
150 struct mc_info *mci = NULL;
151 mctelem_class_t which = MC_URGENT; /* XXXgcc */
152 unsigned int cpu_nr;
153 int errcnt = 0;
154 int i;
155 enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
157 cpu_nr = smp_processor_id();
158 BUG_ON(cpu_nr != v->processor);
160 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
162 memset(&mcg, 0, sizeof (mcg));
163 mcg.common.type = MC_TYPE_GLOBAL;
164 mcg.common.size = sizeof (mcg);
165 if (v != NULL && ((d = v->domain) != NULL)) {
166 mcg.mc_domid = d->domain_id;
167 mcg.mc_vcpuid = v->vcpu_id;
168 } else {
169 mcg.mc_domid = -1;
170 mcg.mc_vcpuid = -1;
171 }
172 mcg.mc_gstatus = gstatus; /* MCG_STATUS */
174 switch (who) {
175 case MCA_MCE_HANDLER:
176 case MCA_MCE_SCAN:
177 mcg.mc_flags = MC_FLAG_MCE;
178 which = MC_URGENT;
179 break;
181 case MCA_POLLER:
182 case MCA_RESET:
183 mcg.mc_flags = MC_FLAG_POLLED;
184 which = MC_NONURGENT;
185 break;
187 case MCA_CMCI_HANDLER:
188 mcg.mc_flags = MC_FLAG_CMCI;
189 which = MC_NONURGENT;
190 break;
192 default:
193 BUG();
194 }
196 /* Retrieve detector information */
197 x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
198 &mcg.mc_coreid, &mcg.mc_core_threadid,
199 &mcg.mc_apicid, NULL, NULL, NULL);
201 /* If no mc_recoverable_scan callback handler is registered,
202 * this error is not recoverable
203 */
204 recover = (mc_recoverable_scan)? 1: 0;
206 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
207 struct mcinfo_bank mcb; /* on stack */
209 /* Skip bank if corresponding bit in bankmask is clear */
210 if (!test_bit(i, bankmask))
211 continue;
213 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
214 if (!(status & MCi_STATUS_VAL))
215 continue; /* this bank has no valid telemetry */
217 /* For the CMCI/MCE handler caller on recent Intel CPUs, we need to
218 * decide whether to clear the bank based on MCi_STATUS bit values such
219 * as OVER/UC/EN/PCC/S/AR
220 */
221 if ( mc_need_clearbank_scan )
222 need_clear = mc_need_clearbank_scan(who, status);
224 /* If this is the first bank with valid MCA DATA, then
225 * try to reserve an entry from the urgent/nonurgent queue
226 * depending on whether we are called from an exception or
227 * a poller; this can fail (for example dom0 may not
228 * yet have consumed past telemetry). */
229 if (errcnt == 0) {
230 if ((mctc = mctelem_reserve(which)) != NULL) {
231 mci = mctelem_dataptr(mctc);
232 mcinfo_clear(mci);
233 }
234 }
236 memset(&mcb, 0, sizeof (mcb));
237 mcb.common.type = MC_TYPE_BANK;
238 mcb.common.size = sizeof (mcb);
239 mcb.mc_bank = i;
240 mcb.mc_status = status;
242 /* form a mask of which banks have logged uncorrected errors */
243 if ((status & MCi_STATUS_UC) != 0)
244 uc |= (1 << i);
246 /* likewise for those with processor context corrupt */
247 if ((status & MCi_STATUS_PCC) != 0)
248 pcc |= (1 << i);
250 if (recover && uc)
251 /* uc = 1, recover = 1, we need not panic.
252 */
253 recover = mc_recoverable_scan(status);
255 addr = misc = 0;
257 if (status & MCi_STATUS_ADDRV) {
258 mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
259 if (mfn_valid(paddr_to_pfn(addr))) {
260 d = maddr_get_owner(addr);
261 if (d != NULL && (who == MCA_POLLER ||
262 who == MCA_CMCI_HANDLER))
263 mcb.mc_domid = d->domain_id;
264 }
265 }
267 if (status & MCi_STATUS_MISCV)
268 mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
270 mcb.mc_addr = addr;
271 mcb.mc_misc = misc;
273 if (who == MCA_CMCI_HANDLER) {
274 mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
275 rdtscll(mcb.mc_tsc);
276 }
278 /* Increment the error count; if this is the first bank
279 * with a valid error then add the global info to the mcinfo. */
280 if (errcnt++ == 0 && mci != NULL)
281 x86_mcinfo_add(mci, &mcg);
283 /* Add the bank data */
284 if (mci != NULL)
285 x86_mcinfo_add(mci, &mcb);
287 if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
288 cbret = mc_callback_bank_extended(mci, i, status);
289 }
291 /* By default, need_clear = 1 */
292 if (who != MCA_MCE_SCAN && need_clear)
293 /* Clear status */
294 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
295 else if ( who == MCA_MCE_SCAN && need_clear)
296 set_bit(i, clear_bank);
298 wmb();
299 }
301 if (mci != NULL && errcnt > 0) {
302 x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
303 mig = (struct mcinfo_global *)mic;
304 if (pcc)
305 mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
306 else if (uc)
307 mcg.mc_flags |= MC_FLAG_RECOVERABLE;
308 else
309 mcg.mc_flags |= MC_FLAG_CORRECTABLE;
310 }
313 if (sp) {
314 sp->errcnt = errcnt;
315 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
316 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
317 sp->uc = uc;
318 sp->pcc = pcc;
319 sp->recoverable = recover;
320 }
322 return mci != NULL ? mctc : NULL; /* may be NULL */
323 }
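To make the reserve/commit/dismiss discipline around mctelem cookies concrete, here is a hedged sketch of a poller-style caller of mcheck_mca_logout(); it mirrors the pattern used by mcheck_cmn_handler() below, and mce_poll_sketch is an invented name:

    /* Illustrative only: a poller-style consumer of mcheck_mca_logout(). */
    static void mce_poll_sketch(void)
    {
        struct mca_summary bs;
        mctelem_cookie_t mctc;

        mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs, NULL);
        if (mctc == NULL)
            return;          /* no telemetry entry was reserved */

        if (bs.errcnt && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
            mctelem_commit(mctc);                 /* hand it to dom0 */
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else {
            mctelem_dismiss(mctc);                /* free the reservation */
        }
    }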
325 #define DOM_NORMAL 0
326 #define DOM0_TRAP 1
327 #define DOMU_TRAP 2
328 #define DOMU_KILLED 4
330 /* Shared #MC handler. */
331 void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
332 cpu_banks_t bankmask)
333 {
334 int xen_state_lost, dom0_state_lost, domU_state_lost;
335 struct vcpu *v = current;
336 struct domain *curdom = v->domain;
337 domid_t domid = curdom->domain_id;
338 int ctx_xen, ctx_dom0, ctx_domU;
339 uint32_t dom_state = DOM_NORMAL;
340 mctelem_cookie_t mctc = NULL;
341 struct mca_summary bs;
342 struct mc_info *mci = NULL;
343 int irqlocked = 0;
344 uint64_t gstatus;
345 int ripv;
347 /* This handler runs as an interrupt gate. So IPIs from the
348 * polling service routine are deferred until we're finished.
349 */
351 /* Disable interrupts for the _vcpu_, so that it cannot be
352 * re-scheduled to another physical CPU. */
353 vcpu_schedule_lock_irq(v);
354 irqlocked = 1;
356 /* Read global status; if it does not indicate machine check
357 * in progress then bail as long as we have a valid ip to return to. */
358 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
359 ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
360 if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
361 add_taint(TAINT_MACHINE_CHECK); /* questionable */
362 vcpu_schedule_unlock_irq(v);
363 irqlocked = 0;
364 goto cmn_handler_done;
365 }
367 /* Go and grab error telemetry. We must choose whether to commit
368 * for logging or dismiss the cookie that is returned, and must not
369 * reference the cookie after that action.
370 */
371 mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
372 if (mctc != NULL)
373 mci = (struct mc_info *)mctelem_dataptr(mctc);
375 /* Clear MCIP or another #MC will enter shutdown state */
376 gstatus &= ~MCG_STATUS_MCIP;
377 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
378 wmb();
380 /* If no valid errors and our stack is intact, we're done */
381 if (ripv && bs.errcnt == 0) {
382 vcpu_schedule_unlock_irq(v);
383 irqlocked = 0;
384 goto cmn_handler_done;
385 }
387 if (bs.uc || bs.pcc)
388 add_taint(TAINT_MACHINE_CHECK);
390 /* Machine check exceptions will usually be for UC and/or PCC errors,
391 * but it is possible to configure machine check for some classes
392 * of corrected error.
393 *
394 * UC errors could compromise any domain or the hypervisor
395 * itself - for example a cache writeback of modified data that
396 * turned out to be bad could be for data belonging to anyone, not
397 * just the current domain. In the absence of known data poisoning
398 * to prevent consumption of such bad data in the system we regard
399 * all UC errors as terminal. It may be possible to attempt some
400 * heuristics based on the address affected, which guests have
401 * mappings to that mfn etc.
402 *
403 * PCC errors apply to the current context.
404 *
405 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
406 * and not PCC is terminal - the return instruction pointer
407 * pushed onto the stack is bogus. If the interrupt context is
408 * the hypervisor or dom0 the game is over, otherwise we can
409 * limit the impact to a single domU but only if we trampoline
410 * somewhere safely - we can't return and unwind the stack.
411 * Since there is no trampoline in place we will treat !RIPV
412 * as terminal for any context.
413 */
414 ctx_xen = SEG_PL(regs->cs) == 0;
415 ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
416 ctx_domU = !ctx_xen && !ctx_dom0;
418 xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
419 !ripv;
420 dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
421 domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
423 if (xen_state_lost) {
424 /* Now we are going to panic anyway. Allow interrupts, so that
425 * printk on serial console can work. */
426 vcpu_schedule_unlock_irq(v);
427 irqlocked = 0;
429 printk("Terminal machine check exception occurred in "
430 "hypervisor context.\n");
432 /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
433 * to the error, then it makes sense to print a stack trace.
434 * That can be useful for more detailed error analysis and/or
435 * error case studies to figure out whether we can clear
436 * xen_impacted and kill a DomU instead
437 * (i.e. if a guest-only control structure is affected, but then
438 * we must ensure the bad pages are not re-used).
439 */
440 if (bs.eipv) {
441 printk("MCE: Instruction Pointer is related to the "
442 "error, therefore print the execution state.\n");
443 show_execution_state(regs);
444 }
446 /* Commit the telemetry so that panic flow can find it. */
447 if (mctc != NULL) {
448 x86_mcinfo_dump(mci);
449 mctelem_commit(mctc);
450 }
451 mc_panic("Hypervisor state lost due to machine check "
452 "exception.\n");
453 /*NOTREACHED*/
454 }
456 /*
457 * Xen hypervisor state is intact. If dom0 state is lost then
458 * give it a chance to decide what to do if it has registered
459 * a handler for this event, otherwise panic.
460 *
461 * XXFM Could add some Solaris dom0 contract kill here?
462 */
463 if (dom0_state_lost) {
464 if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
465 dom_state = DOM0_TRAP;
466 send_guest_trap(dom0, 0, TRAP_machine_check);
467 /* XXFM case of return with !ripv ??? */
468 } else {
469 /* Commit telemetry for panic flow. */
470 if (mctc != NULL) {
471 x86_mcinfo_dump(mci);
472 mctelem_commit(mctc);
473 }
474 mc_panic("Dom0 state lost due to machine check "
475 "exception\n");
476 /*NOTREACHED*/
477 }
478 }
480 /*
481 * If a domU has lost state then send it a trap if it has registered
482 * a handler, otherwise crash the domain.
483 * XXFM Revisit this functionality.
484 */
485 if (domU_state_lost) {
486 if (guest_has_trap_callback(v->domain, v->vcpu_id,
487 TRAP_machine_check)) {
488 dom_state = DOMU_TRAP;
489 send_guest_trap(curdom, v->vcpu_id,
490 TRAP_machine_check);
491 } else {
492 dom_state = DOMU_KILLED;
493 /* Enable interrupts. This basically results in
494 * calling sti on the *physical* cpu. But after
495 * domain_crash() the vcpu pointer is invalid.
496 * Therefore, we must unlock the irqs before killing
497 * it. */
498 vcpu_schedule_unlock_irq(v);
499 irqlocked = 0;
501 /* DomU is impacted. Kill it and continue. */
502 domain_crash(curdom);
503 }
504 }
506 switch (dom_state) {
507 case DOM0_TRAP:
508 case DOMU_TRAP:
509 /* Enable interrupts. */
510 vcpu_schedule_unlock_irq(v);
511 irqlocked = 0;
513 /* guest softirqs and event callbacks are scheduled
514 * immediately after this handler exits. */
515 break;
516 case DOMU_KILLED:
517 /* Nothing to do here. */
518 break;
520 case DOM_NORMAL:
521 vcpu_schedule_unlock_irq(v);
522 irqlocked = 0;
523 break;
524 }
526 cmn_handler_done:
527 BUG_ON(irqlocked);
528 BUG_ON(!ripv);
530 if (bs.errcnt) {
531 /* Not panicking, so forward telemetry to dom0 now if it
532 * is interested. */
533 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
534 if (mctc != NULL)
535 mctelem_commit(mctc);
536 send_guest_global_virq(dom0, VIRQ_MCA);
537 } else {
538 x86_mcinfo_dump(mci);
539 if (mctc != NULL)
540 mctelem_dismiss(mctc);
541 }
542 } else if (mctc != NULL) {
543 mctelem_dismiss(mctc);
544 }
545 }
547 void mcheck_mca_clearbanks(cpu_banks_t bankmask)
548 {
549 int i;
550 uint64_t status;
552 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
553 if (!test_bit(i, bankmask))
554 continue;
555 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
556 if (!(status & MCi_STATUS_VAL))
557 continue;
558 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
559 }
560 }
562 static int amd_mcheck_init(struct cpuinfo_x86 *ci)
563 {
564 int rc = 0;
566 switch (ci->x86) {
567 case 6:
568 rc = amd_k7_mcheck_init(ci);
569 break;
571 default:
572 /* Assume that machine check support is available.
573 * The minimum provided support is at least the K8. */
574 case 0xf:
575 rc = amd_k8_mcheck_init(ci);
576 break;
578 case 0x10:
579 case 0x11:
580 rc = amd_f10_mcheck_init(ci);
581 break;
582 }
584 return rc;
585 }
587 /*check the existence of Machine Check*/
588 int mce_available(struct cpuinfo_x86 *c)
589 {
590 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
591 }
593 static int mce_is_broadcast(struct cpuinfo_x86 *c)
594 {
595 if (mce_force_broadcast)
596 return 1;
598 /* According to the Intel SDM (Dec. 2009, section 15.10.4.1), for
599 * processors with a DisplayFamily_DisplayModel encoding of 06H_EH and
600 * above, an MCA signal is broadcast to all logical processors in the system
601 */
602 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
603 c->x86_model >= 0xe)
604 return 1;
605 return 0;
606 }
608 /*
609 * Check if bank 0 is usable for MCE. It isn't for AMD K7,
610 * and Intel P6 family before model 0x1a.
611 */
612 int mce_firstbank(struct cpuinfo_x86 *c)
613 {
614 if (c->x86 == 6) {
615 if (c->x86_vendor == X86_VENDOR_AMD)
616 return 1;
618 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
619 return 1;
620 }
622 return 0;
623 }
625 /* This has to be run for each processor */
626 void mcheck_init(struct cpuinfo_x86 *c)
627 {
628 int inited = 0, i, broadcast;
629 static int broadcast_check;
631 if (mce_disabled == 1) {
632 dprintk(XENLOG_INFO, "MCE support disabled by bootparam\n");
633 return;
634 }
636 broadcast = mce_is_broadcast(c);
637 if (broadcast_check && (broadcast != mce_broadcast) )
638 dprintk(XENLOG_INFO,
639 "CPUs have mixed broadcast support"
640 "may cause undetermined result!!!\n");
642 broadcast_check = 1;
643 if (broadcast)
644 mce_broadcast = broadcast;
646 for (i = 0; i < MAX_NR_BANKS; i++)
647 set_bit(i,mca_allbanks);
649 /* Enforce at least MCE support in CPUID information. Individual
650 * families may also need to enforce a check for MCA support. */
651 if (!cpu_has(c, X86_FEATURE_MCE)) {
652 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
653 smp_processor_id());
654 return;
655 }
657 intpose_init();
658 mctelem_init(sizeof (struct mc_info));
660 switch (c->x86_vendor) {
661 case X86_VENDOR_AMD:
662 inited = amd_mcheck_init(c);
663 break;
665 case X86_VENDOR_INTEL:
666 switch (c->x86) {
667 case 6:
668 case 15:
669 inited = intel_mcheck_init(c);
670 break;
671 }
672 break;
674 default:
675 break;
676 }
678 if ( !h_mci_ctrl )
679 {
680 h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks);
681 if (!h_mci_ctrl)
682 {
683 dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n");
684 return;
685 }
686 /* Don't care about banks before firstbank */
687 memset(h_mci_ctrl, 0xff, nr_mce_banks * sizeof(*h_mci_ctrl));
688 for (i = firstbank; i < nr_mce_banks; i++)
689 rdmsrl(MSR_IA32_MC0_CTL + 4*i, h_mci_ctrl[i]);
690 }
691 if (g_mcg_cap & MCG_CTL_P)
692 rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl);
693 set_poll_bankmask(c);
694 if (!inited)
695 printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
696 smp_processor_id());
697 }
699 u64 mce_cap_init(void)
700 {
701 u32 l, h;
702 u64 value;
704 rdmsr(MSR_IA32_MCG_CAP, l, h);
705 value = ((u64)h << 32) | l;
706 /* For Guest vMCE usage */
707 g_mcg_cap = value & ~MCG_CMCI_P;
709 if (l & MCG_CTL_P) /* Control register present ? */
710 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
712 nr_mce_banks = l & MCG_CAP_COUNT;
713 if ( nr_mce_banks > MAX_NR_BANKS )
714 {
715 printk(KERN_WARNING "MCE: number of MCA banks exceeds MAX_NR_BANKS, capping\n");
716 g_mcg_cap = (g_mcg_cap & ~MCG_CAP_COUNT) | MAX_NR_BANKS;
717 }
719 return value;
720 }
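As a reading aid for mce_cap_init() above, the IA32_MCG_CAP fields it consumes are laid out as follows (bit positions per the Intel SDM; the code relies on the MCG_* masks from mce.h rather than these literal values):

    /* IA32_MCG_CAP layout used above (per the Intel SDM; informational):
     *   bits 7:0  MCG_CAP_COUNT - number of MCi_* register banks
     *   bit  8    MCG_CTL_P     - IA32_MCG_CTL register is present
     *   bit  10   MCG_CMCI_P    - CMCI supported (masked out of the
     *                            guest-visible g_mcg_cap above)
     */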
722 /* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
723 void mce_init_msr(struct domain *d)
724 {
725 d->arch.vmca_msrs.mcg_status = 0x0;
726 d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
727 d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
728 d->arch.vmca_msrs.nr_injection = 0;
729 memset(d->arch.vmca_msrs.mci_ctl, ~0,
730 sizeof(d->arch.vmca_msrs.mci_ctl));
731 INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
732 spin_lock_init(&d->arch.vmca_msrs.lock);
733 }
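The bank-MSR decode in mce_rdmsr()/mce_wrmsr() below relies on the fixed MCA layout of four MSRs per bank starting at IA32_MC0_CTL (architecturally 0x400). A worked example of the arithmetic:

    /* Worked example (0x400 == MSR_IA32_MC0_CTL, architectural value):
     *   msr  = 0x409                       (IA32_MC2_STATUS)
     *   bank = (0x409 - 0x400) / 4 = 2
     *   msr & (MSR_IA32_MC0_CTL | 3) = 0x401 = MSR_IA32_MC0_STATUS
     * so the access below is routed to the STATUS leg of bank 2.
     */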
735 int mce_rdmsr(uint32_t msr, uint64_t *val)
736 {
737 struct domain *d = current->domain;
738 int ret = 1;
739 unsigned int bank;
740 struct bank_entry *entry = NULL;
742 *val = 0;
743 spin_lock(&d->arch.vmca_msrs.lock);
745 switch ( msr )
746 {
747 case MSR_IA32_MCG_STATUS:
748 *val = d->arch.vmca_msrs.mcg_status;
749 if (*val)
750 mce_printk(MCE_VERBOSE,
751 "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
752 break;
753 case MSR_IA32_MCG_CAP:
754 *val = d->arch.vmca_msrs.mcg_cap;
755 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
756 *val);
757 break;
758 case MSR_IA32_MCG_CTL:
759 /* Always 0 if no CTL support */
760 *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
761 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
762 *val);
763 break;
764 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
765 bank = (msr - MSR_IA32_MC0_CTL) / 4;
766 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
767 {
768 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
769 ret = 0;
770 break;
771 }
772 switch (msr & (MSR_IA32_MC0_CTL | 3))
773 {
774 case MSR_IA32_MC0_CTL:
775 *val = d->arch.vmca_msrs.mci_ctl[bank] &
776 (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
777 mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
778 bank, *val);
779 break;
780 case MSR_IA32_MC0_STATUS:
781 /* Only error bank is read. Non-error banks simply return. */
782 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
783 {
784 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
785 struct bank_entry, list);
786 if (entry->bank == bank) {
787 *val = entry->mci_status;
788 mce_printk(MCE_VERBOSE,
789 "MCE: rd MC%u_STATUS in vMCE# context "
790 "value 0x%"PRIx64"\n", bank, *val);
791 }
792 else
793 entry = NULL;
794 }
795 break;
796 case MSR_IA32_MC0_ADDR:
797 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
798 {
799 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
800 struct bank_entry, list);
801 if ( entry->bank == bank )
802 {
803 *val = entry->mci_addr;
804 mce_printk(MCE_VERBOSE,
805 "MCE: rdmsr MC%u_ADDR in vMCE# context "
806 "0x%"PRIx64"\n", bank, *val);
807 }
808 }
809 break;
810 case MSR_IA32_MC0_MISC:
811 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
812 {
813 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
814 struct bank_entry, list);
815 if ( entry->bank == bank )
816 {
817 *val = entry->mci_misc;
818 mce_printk(MCE_VERBOSE,
819 "MCE: rd MC%u_MISC in vMCE# context "
820 "0x%"PRIx64"\n", bank, *val);
821 }
822 }
823 break;
824 }
825 break;
826 default:
827 switch ( boot_cpu_data.x86_vendor )
828 {
829 case X86_VENDOR_INTEL:
830 ret = intel_mce_rdmsr(msr, val);
831 break;
832 default:
833 ret = 0;
834 break;
835 }
836 break;
837 }
839 spin_unlock(&d->arch.vmca_msrs.lock);
840 return ret;
841 }
843 int mce_wrmsr(u32 msr, u64 val)
844 {
845 struct domain *d = current->domain;
846 struct bank_entry *entry = NULL;
847 unsigned int bank;
848 int ret = 1;
850 if ( !g_mcg_cap )
851 return 0;
853 spin_lock(&d->arch.vmca_msrs.lock);
855 switch ( msr )
856 {
857 case MSR_IA32_MCG_CTL:
858 d->arch.vmca_msrs.mcg_ctl = val;
859 break;
860 case MSR_IA32_MCG_STATUS:
861 d->arch.vmca_msrs.mcg_status = val;
862 mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
863 /* For an HVM guest, this is the point where the vMCE injection node is deleted */
864 if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
865 {
866 d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
867 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
868 {
869 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
870 struct bank_entry, list);
871 if ( entry->mci_status & MCi_STATUS_VAL )
872 mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
873 "been cleared before write MCG_STATUS MSR\n");
875 mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
876 "Node, nr_injection %u\n",
877 d->arch.vmca_msrs.nr_injection);
878 list_del(&entry->list);
879 xfree(entry);
880 }
881 else
882 mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
883 " last injection Node, something Wrong!\n");
884 }
885 break;
886 case MSR_IA32_MCG_CAP:
887 mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
888 ret = -1;
889 break;
890 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
891 bank = (msr - MSR_IA32_MC0_CTL) / 4;
892 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
893 {
894 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
895 ret = 0;
896 break;
897 }
898 switch ( msr & (MSR_IA32_MC0_CTL | 3) )
899 {
900 case MSR_IA32_MC0_CTL:
901 d->arch.vmca_msrs.mci_ctl[bank] = val;
902 break;
903 case MSR_IA32_MC0_STATUS:
904 /* Take the first entry of the list; it corresponds to the current
905 * vMCE# injection. When the guest has finished processing the vMCE#,
906 * this node will be deleted.
907 * Only the error bank is written. Non-error banks simply return.
908 */
909 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
910 {
911 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
912 struct bank_entry, list);
913 if ( entry->bank == bank )
914 entry->mci_status = val;
915 mce_printk(MCE_VERBOSE,
916 "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
917 bank, val);
918 }
919 else
920 mce_printk(MCE_VERBOSE,
921 "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
922 break;
923 case MSR_IA32_MC0_ADDR:
924 mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
925 ret = -1;
926 break;
927 case MSR_IA32_MC0_MISC:
928 mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
929 ret = -1;
930 break;
931 }
932 break;
933 default:
934 switch ( boot_cpu_data.x86_vendor )
935 {
936 case X86_VENDOR_INTEL:
937 ret = intel_mce_wrmsr(msr, val);
938 break;
939 default:
940 ret = 0;
941 break;
942 }
943 break;
944 }
946 spin_unlock(&d->arch.vmca_msrs.lock);
947 return ret;
948 }
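The MCG_STATUS write path above retires the head of the impact_header list, which implies a particular ordering on the guest side. A hedged sketch of the sequence a guest #MC handler is expected to follow against these virtual MSRs (guest-side code, invented for illustration):

    /* Illustrative guest-side ordering implied by mce_wrmsr() above. */
    static void guest_vmce_sketch(unsigned int bank)
    {
        uint64_t status, addr, misc;

        rdmsrl(MSR_IA32_MC0_STATUS + 4 * bank, status); /* served from the   */
        rdmsrl(MSR_IA32_MC0_ADDR + 4 * bank, addr);     /* queued bank_entry */
        rdmsrl(MSR_IA32_MC0_MISC + 4 * bank, misc);

        /* Clear the bank first, then write MCG_STATUS: the write handler
         * above warns if MCi_STATUS is still valid and deletes the queued
         * injection node when MCG_STATUS is written. */
        wrmsrl(MSR_IA32_MC0_STATUS + 4 * bank, 0);
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
    }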
950 static void mcinfo_clear(struct mc_info *mi)
951 {
952 memset(mi, 0, sizeof(struct mc_info));
953 x86_mcinfo_nentries(mi) = 0;
954 }
956 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
957 {
958 int i;
959 unsigned long end1, end2;
960 struct mcinfo_common *mic, *mic_base, *mic_index;
962 mic = (struct mcinfo_common *)mcinfo;
963 mic_index = mic_base = x86_mcinfo_first(mi);
965 /* go to first free entry */
966 for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
967 mic_index = x86_mcinfo_next(mic_index);
968 }
970 /* check if there is enough space */
971 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
972 end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
974 if (end1 < end2)
975 return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
977 /* there's enough space. add entry. */
978 memcpy(mic_index, mic, mic->size);
979 x86_mcinfo_nentries(mi)++;
981 return 0;
982 }
984 /* Dump machine check information in a format
985 * mcelog can parse. This is used only when
986 * Dom0 does not take the notification. */
987 void x86_mcinfo_dump(struct mc_info *mi)
988 {
989 struct mcinfo_common *mic = NULL;
990 struct mcinfo_global *mc_global;
991 struct mcinfo_bank *mc_bank;
993 /* first print the global info */
994 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
995 if (mic == NULL)
996 return;
997 mc_global = (struct mcinfo_global *)mic;
998 if (mc_global->mc_flags & MC_FLAG_MCE) {
999 printk(XENLOG_WARNING
1000 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
1001 mc_global->mc_coreid, mc_global->mc_gstatus);
1002 } else {
1003 printk(XENLOG_WARNING "MCE: The hardware reports a non "
1004 "fatal, correctable incident occurred on "
1005 "CPU %d.\n",
1006 mc_global->mc_coreid);
1007 }
1009 /* then the bank information */
1010 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
1011 do {
1012 if (mic == NULL)
1013 return;
1014 if (mic->type != MC_TYPE_BANK)
1015 goto next;
1017 mc_bank = (struct mcinfo_bank *)mic;
1019 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
1020 mc_bank->mc_bank,
1021 mc_bank->mc_status);
1022 if (mc_bank->mc_status & MCi_STATUS_MISCV)
1023 printk("[%16"PRIx64"]", mc_bank->mc_misc);
1024 if (mc_bank->mc_status & MCi_STATUS_ADDRV)
1025 printk(" at %16"PRIx64, mc_bank->mc_addr);
1027 printk("\n");
1028 next:
1029 mic = x86_mcinfo_next(mic); /* next entry */
1030 if ((mic == NULL) || (mic->size == 0))
1031 break;
1032 } while (1);
1033 }
1035 static void do_mc_get_cpu_info(void *v)
1037 int cpu = smp_processor_id();
1038 int cindex, cpn;
1039 struct cpuinfo_x86 *c;
1040 xen_mc_logical_cpu_t *log_cpus, *xcp;
1041 uint32_t junk, ebx;
1043 log_cpus = v;
1044 c = &cpu_data[cpu];
1045 cindex = 0;
1046 cpn = cpu - 1;
1048 /*
1049 * Deal with sparse masks, condensed into a contig array.
1050 */
1051 while (cpn >= 0) {
1052 if (cpu_isset(cpn, cpu_online_map))
1053 cindex++;
1054 cpn--;
1057 xcp = &log_cpus[cindex];
1058 c = &cpu_data[cpu];
1059 xcp->mc_cpunr = cpu;
1060 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
1061 &xcp->mc_coreid, &xcp->mc_threadid,
1062 &xcp->mc_apicid, &xcp->mc_ncores,
1063 &xcp->mc_ncores_active, &xcp->mc_nthreads);
1064 xcp->mc_cpuid_level = c->cpuid_level;
1065 xcp->mc_family = c->x86;
1066 xcp->mc_vendor = c->x86_vendor;
1067 xcp->mc_model = c->x86_model;
1068 xcp->mc_step = c->x86_mask;
1069 xcp->mc_cache_size = c->x86_cache_size;
1070 xcp->mc_cache_alignment = c->x86_cache_alignment;
1071 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1072 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1073 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1075 /*
1076 * This part needs to run on the CPU itself.
1077 */
1078 xcp->mc_nmsrvals = __MC_NMSRS;
1079 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1080 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1082 if (c->cpuid_level >= 1) {
1083 cpuid(1, &junk, &ebx, &junk, &junk);
1084 xcp->mc_clusterid = (ebx >> 24) & 0xff;
1085 } else
1086 xcp->mc_clusterid = hard_smp_processor_id();
1090 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
1091 uint16_t *threadid, uint32_t *apicid,
1092 unsigned *ncores, unsigned *ncores_active,
1093 unsigned *nthreads)
1095 struct cpuinfo_x86 *c;
1097 *apicid = cpu_physical_id(cpu);
1098 c = &cpu_data[cpu];
1099 if (c->apicid == BAD_APICID) {
1100 *chipid = cpu;
1101 *coreid = 0;
1102 *threadid = 0;
1103 if (ncores != NULL)
1104 *ncores = 1;
1105 if (ncores_active != NULL)
1106 *ncores_active = 1;
1107 if (nthreads != NULL)
1108 *nthreads = 1;
1109 } else {
1110 *chipid = phys_proc_id[cpu];
1111 if (c->x86_max_cores > 1)
1112 *coreid = cpu_core_id[cpu];
1113 else
1114 *coreid = 0;
1115 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1116 if (ncores != NULL)
1117 *ncores = c->x86_max_cores;
1118 if (ncores_active != NULL)
1119 *ncores_active = c->booted_cores;
1120 if (nthreads != NULL)
1121 *nthreads = c->x86_num_siblings;
1125 #define INTPOSE_NENT 50
1127 static struct intpose_ent {
1128 unsigned int cpu_nr;
1129 uint64_t msr;
1130 uint64_t val;
1131 } intpose_arr[INTPOSE_NENT];
1133 static void intpose_init(void)
1135 static int done;
1136 int i;
1138 if (done++ > 0)
1139 return;
1141 for (i = 0; i < INTPOSE_NENT; i++) {
1142 intpose_arr[i].cpu_nr = -1;
1147 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1148 uint64_t *valp)
1150 int i;
1152 for (i = 0; i < INTPOSE_NENT; i++) {
1153 if (intpose_arr[i].cpu_nr == cpu_nr &&
1154 intpose_arr[i].msr == msr) {
1155 if (valp != NULL)
1156 *valp = intpose_arr[i].val;
1157 return &intpose_arr[i];
1161 return NULL;
1164 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1166 struct intpose_ent *ent;
1167 int i;
1169 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1170 ent->val = val;
1171 return;
1174 for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
1175 if (ent->cpu_nr == -1) {
1176 ent->cpu_nr = cpu_nr;
1177 ent->msr = msr;
1178 ent->val = val;
1179 return;
1183 printk("intpose_add: interpose array full - request dropped\n");
1186 void intpose_inval(unsigned int cpu_nr, uint64_t msr)
1188 struct intpose_ent *ent;
1190 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1191 ent->cpu_nr = -1;
1195 #define IS_MCA_BANKREG(r) \
1196 ((r) >= MSR_IA32_MC0_CTL && \
1197 (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
1198 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
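A worked example of which registers IS_MCA_BANKREG() accepts, assuming the architectural numbering where IA32_MC0_CTL is 0x400:

    /* Worked example for IS_MCA_BANKREG() (0x400 == MSR_IA32_MC0_CTL):
     *   0x400 (MC0_CTL)    -> rejected, ((r) - 0x400) % 4 == 0
     *   0x401 (MC0_STATUS) -> accepted
     *   0x402 (MC0_ADDR)   -> accepted
     *   0x403 (MC0_MISC)   -> accepted
     *   0x404 (MC1_CTL)    -> rejected again, and so on for every bank
     *                         up to MC(nr_mce_banks-1)_MISC.
     */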
1200 int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d)
1202 int bank_nr;
1204 if ( !bank || !d || !h_mci_ctrl )
1205 return 1;
1207 /* Will MCE happen in the host if the host mcg_ctl is 0? */
1208 if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
1209 return 1;
1211 bank_nr = bank->mc_bank;
1212 if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
1213 return 1;
1214 return 0;
1217 static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1219 struct cpuinfo_x86 *c;
1220 int i, errs = 0;
1222 c = &cpu_data[smp_processor_id()];
1224 for (i = 0; i < mci->mcinj_count; i++) {
1225 uint64_t reg = mci->mcinj_msr[i].reg;
1226 const char *reason = NULL;
1228 if (IS_MCA_BANKREG(reg)) {
1229 if (c->x86_vendor == X86_VENDOR_AMD) {
1230 /* On AMD we can set MCi_STATUS_WREN in the
1231 * HWCR MSR to allow non-zero writes to banks
1232 * MSRs not to #GP. The injector in dom0
1233 * should set that bit, but we detect when it
1234 * is necessary and set it as a courtesy to
1235 * avoid #GP in the hypervisor. */
1236 mci->mcinj_flags |=
1237 _MC_MSRINJ_F_REQ_HWCR_WREN;
1238 continue;
1239 } else {
1240 /* No alternative but to interpose, so require
1241 * that the injector specify interposition. */
1242 if (!(mci->mcinj_flags &
1243 MC_MSRINJ_F_INTERPOSE)) {
1244 reason = "must specify interposition";
1247 } else {
1248 switch (reg) {
1249 /* MSRs acceptable on all x86 cpus */
1250 case MSR_IA32_MCG_STATUS:
1251 break;
1253 /* MSRs that the HV will take care of */
1254 case MSR_K8_HWCR:
1255 if (c->x86_vendor == X86_VENDOR_AMD)
1256 reason = "HV will operate HWCR";
1257 else
1258 reason ="only supported on AMD";
1259 break;
1261 default:
1262 reason = "not a recognized MCA MSR";
1263 break;
1267 if (reason != NULL) {
1268 printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
1269 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1270 errs++;
1274 return !errs;
1277 static uint64_t x86_mc_hwcr_wren(void)
1279 uint64_t old;
1281 rdmsrl(MSR_K8_HWCR, old);
1283 if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
1284 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1285 wrmsrl(MSR_K8_HWCR, new);
1288 return old;
1291 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1293 if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
1294 wrmsrl(MSR_K8_HWCR, hwcr);
1297 static void x86_mc_msrinject(void *data)
1299 struct xen_mc_msrinject *mci = data;
1300 struct mcinfo_msr *msr;
1301 struct cpuinfo_x86 *c;
1302 uint64_t hwcr = 0;
1303 int intpose;
1304 int i;
1306 c = &cpu_data[smp_processor_id()];
1308 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1309 hwcr = x86_mc_hwcr_wren();
1311 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1313 for (i = 0, msr = &mci->mcinj_msr[0];
1314 i < mci->mcinj_count; i++, msr++) {
1315 printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
1316 "<-- 0x%llx\n",
1317 intpose ? "interpose" : "hardware",
1318 mci->mcinj_cpunr, smp_processor_id(),
1319 (unsigned long long)msr->reg,
1320 (unsigned long long)msr->value);
1322 if (intpose)
1323 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1324 else
1325 wrmsrl(msr->reg, msr->value);
1328 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1329 x86_mc_hwcr_wren_restore(hwcr);
1332 /*ARGSUSED*/
1333 static void x86_mc_mceinject(void *data)
1335 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1336 __asm__ __volatile__("int $0x12");
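For context on the self-injection above, an editorial note in comment form:

    /* Note (editorial): vector 0x12 (18) is #MC, so the "int $0x12" above
     * should land in machine_check_vector() and from there in whatever
     * handler was installed via x86_mce_vector_register(); this is what
     * lets a software-triggered MCA (see the "mce_fb" parameter) exercise
     * the real handler path. */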
1339 #if BITS_PER_LONG == 64
1341 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1342 #define COOKIE2ID(c) ((uint64_t)(c))
1344 #elif BITS_PER_LONG == 32
1346 #define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
1347 #define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
1349 #elif defined(BITS_PER_LONG)
1350 #error BITS_PER_LONG has unexpected value
1351 #else
1352 #error BITS_PER_LONG definition absent
1353 #endif
1355 #ifdef CONFIG_COMPAT
1356 # include <compat/arch-x86/xen-mca.h>
1358 # define xen_mcinfo_msr mcinfo_msr
1359 CHECK_mcinfo_msr;
1360 # undef xen_mcinfo_msr
1361 # undef CHECK_mcinfo_msr
1362 # define CHECK_mcinfo_msr struct mcinfo_msr
1364 # define xen_mcinfo_common mcinfo_common
1365 CHECK_mcinfo_common;
1366 # undef xen_mcinfo_common
1367 # undef CHECK_mcinfo_common
1368 # define CHECK_mcinfo_common struct mcinfo_common
1370 CHECK_FIELD_(struct, mc_fetch, flags);
1371 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1372 # define CHECK_compat_mc_fetch struct mc_fetch
1374 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1375 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1377 CHECK_mc;
1378 # undef CHECK_compat_mc_fetch
1379 # undef CHECK_compat_mc_physcpuinfo
1381 # define xen_mc_info mc_info
1382 CHECK_mc_info;
1383 # undef xen_mc_info
1385 # define xen_mcinfo_global mcinfo_global
1386 CHECK_mcinfo_global;
1387 # undef xen_mcinfo_global
1389 # define xen_mcinfo_bank mcinfo_bank
1390 CHECK_mcinfo_bank;
1391 # undef xen_mcinfo_bank
1393 # define xen_mcinfo_extended mcinfo_extended
1394 CHECK_mcinfo_extended;
1395 # undef xen_mcinfo_extended
1397 # define xen_mcinfo_recovery mcinfo_recovery
1398 # define xen_cpu_offline_action cpu_offline_action
1399 # define xen_page_offline_action page_offline_action
1400 CHECK_mcinfo_recovery;
1401 # undef xen_cpu_offline_action
1402 # undef xen_page_offline_action
1403 # undef xen_mcinfo_recovery
1404 #else
1405 # define compat_mc_fetch xen_mc_fetch
1406 # define compat_mc_physcpuinfo xen_mc_physcpuinfo
1407 # define compat_handle_is_null guest_handle_is_null
1408 # define copy_to_compat copy_to_guest
1409 #endif
1411 /* Machine Check Architecture Hypercall */
1412 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
1414 long ret = 0;
1415 struct xen_mc curop, *op = &curop;
1416 struct vcpu *v = current;
1417 union {
1418 struct xen_mc_fetch *nat;
1419 struct compat_mc_fetch *cmp;
1420 } mc_fetch;
1421 union {
1422 struct xen_mc_physcpuinfo *nat;
1423 struct compat_mc_physcpuinfo *cmp;
1424 } mc_physcpuinfo;
1425 uint32_t flags, cmdflags;
1426 int nlcpu;
1427 xen_mc_logical_cpu_t *log_cpus = NULL;
1428 mctelem_cookie_t mctc;
1429 mctelem_class_t which;
1430 unsigned int target;
1431 struct xen_mc_msrinject *mc_msrinject;
1432 struct xen_mc_mceinject *mc_mceinject;
1434 if (!IS_PRIV(v->domain) )
1435 return x86_mcerr(NULL, -EPERM);
1437 if ( copy_from_guest(op, u_xen_mc, 1) )
1438 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1440 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1441 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1443 switch (op->cmd) {
1444 case XEN_MC_fetch:
1445 mc_fetch.nat = &op->u.mc_fetch;
1446 cmdflags = mc_fetch.nat->flags;
1448 switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
1449 case XEN_MC_NONURGENT:
1450 which = MC_NONURGENT;
1451 break;
1453 case XEN_MC_URGENT:
1454 which = MC_URGENT;
1455 break;
1457 default:
1458 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1461 flags = XEN_MC_OK;
1463 if (cmdflags & XEN_MC_ACK) {
1464 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1465 mctelem_ack(which, cookie);
1466 } else {
1467 if (!is_pv_32on64_vcpu(v)
1468 ? guest_handle_is_null(mc_fetch.nat->data)
1469 : compat_handle_is_null(mc_fetch.cmp->data))
1470 return x86_mcerr("do_mca fetch: guest buffer "
1471 "invalid", -EINVAL);
1473 if ((mctc = mctelem_consume_oldest_begin(which))) {
1474 struct mc_info *mcip = mctelem_dataptr(mctc);
1475 if (!is_pv_32on64_vcpu(v)
1476 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1477 : copy_to_compat(mc_fetch.cmp->data,
1478 mcip, 1)) {
1479 ret = -EFAULT;
1480 flags |= XEN_MC_FETCHFAILED;
1481 mc_fetch.nat->fetch_id = 0;
1482 } else {
1483 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1485 mctelem_consume_oldest_end(mctc);
1486 } else {
1487 /* There is no data */
1488 flags |= XEN_MC_NODATA;
1489 mc_fetch.nat->fetch_id = 0;
1492 mc_fetch.nat->flags = flags;
1493 if (copy_to_guest(u_xen_mc, op, 1) != 0)
1494 ret = -EFAULT;
1497 break;
1499 case XEN_MC_notifydomain:
1500 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1502 case XEN_MC_physcpuinfo:
1503 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1504 nlcpu = num_online_cpus();
1506 if (!is_pv_32on64_vcpu(v)
1507 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1508 : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
1509 if (mc_physcpuinfo.nat->ncpus <= 0)
1510 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1511 -EINVAL);
1512 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1513 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1514 if (log_cpus == NULL)
1515 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1517 if (on_each_cpu(do_mc_get_cpu_info, log_cpus, 1)) {
1518 xfree(log_cpus);
1519 return x86_mcerr("do_mca cpuinfo", -EIO);
1521 if (!is_pv_32on64_vcpu(v)
1522 ? copy_to_guest(mc_physcpuinfo.nat->info,
1523 log_cpus, nlcpu)
1524 : copy_to_compat(mc_physcpuinfo.cmp->info,
1525 log_cpus, nlcpu))
1526 ret = -EFAULT;
1527 xfree(log_cpus);
1530 mc_physcpuinfo.nat->ncpus = nlcpu;
1532 if (copy_to_guest(u_xen_mc, op, 1))
1533 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1535 break;
1537 case XEN_MC_msrinject:
1538 if (nr_mce_banks == 0)
1539 return x86_mcerr("do_mca inject", -ENODEV);
1541 mc_msrinject = &op->u.mc_msrinject;
1542 target = mc_msrinject->mcinj_cpunr;
1544 if (target >= NR_CPUS)
1545 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1547 if (!cpu_isset(target, cpu_online_map))
1548 return x86_mcerr("do_mca inject: target offline",
1549 -EINVAL);
1551 if (mc_msrinject->mcinj_count == 0)
1552 return 0;
1554 if (!x86_mc_msrinject_verify(mc_msrinject))
1555 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1557 add_taint(TAINT_ERROR_INJECT);
1559 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1560 mc_msrinject, 1);
1562 break;
1564 case XEN_MC_mceinject:
1565 if (nr_mce_banks == 0)
1566 return x86_mcerr("do_mca #MC", -ENODEV);
1568 mc_mceinject = &op->u.mc_mceinject;
1569 target = mc_mceinject->mceinj_cpunr;
1571 if (target >= NR_CPUS)
1572 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1574 if (!cpu_isset(target, cpu_online_map))
1575 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1577 add_taint(TAINT_ERROR_INJECT);
1579 if ( mce_broadcast )
1580 on_each_cpu(x86_mc_mceinject, mc_mceinject, 0);
1581 else
1582 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1583 mc_mceinject, 1);
1584 break;
1586 default:
1587 return x86_mcerr("do_mca: bad command", -EINVAL);
1590 return ret;
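To show how the XEN_MC_fetch leg of this hypercall is meant to be driven, here is a hedged sketch of a dom0-side consumer. HYPERVISOR_mca() is a placeholder for whatever hypercall wrapper the dom0 kernel provides, set_xen_guest_handle() is the usual Xen guest-handle initialiser, and error handling is pared down:

    /* Hedged sketch of a dom0-side consumer of XEN_MC_fetch. */
    static void fetch_one_mca_record_sketch(void)
    {
        struct xen_mc mc;
        struct mc_info mi;

        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
        set_xen_guest_handle(mc.u.mc_fetch.data, &mi);

        if (HYPERVISOR_mca(&mc))
            return;
        if (mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED))
            return;

        /* ... decode mi with the x86_mcinfo_* helpers, then acknowledge
         * the record so the hypervisor can reuse the telemetry slot ... */
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
        HYPERVISOR_mca(&mc);   /* fetch_id was filled in by the first call */
    }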
1592 void set_poll_bankmask(struct cpuinfo_x86 *c)
1595 if (cmci_support && !mce_disabled) {
1596 memcpy(&(__get_cpu_var(poll_bankmask)),
1597 &(__get_cpu_var(no_cmci_banks)), sizeof(cpu_banks_t));
1599 else {
1600 memcpy(&(get_cpu_var(poll_bankmask)), &mca_allbanks, sizeof(cpu_banks_t));
1601 if (mce_firstbank(c))
1602 clear_bit(0, get_cpu_var(poll_bankmask));
1605 void mc_panic(char *s)
1607 is_mc_panic = 1;
1608 console_start_sync();
1609 printk("Fatal machine check: %s\n", s);
1610 printk("\n"
1611 "****************************************\n"
1612 "\n"
1613 " The processor has reported a hardware error which cannot\n"
1614 " be recovered from. Xen will now reboot the machine.\n");
1615 panic("HARDWARE ERROR");