
view xen/arch/x86/cpu/mcheck/mce.c @ 20907:f85120520509

x86 mca: Handle the vMCA bank correctly

Currently the virtual MCE MSR handling assumes that the MSRs for all banks
from 0 to MAX_NR_BANKS are always MCE MSRs, which is not always correct. With
this patch, mce_rdmsr/mce_wrmsr only handle the vMCE MSRs for banks 0 up to
the number of MCA banks present on the host platform.
Note that some MSRs beyond the host platform's MCA banks are still genuine
MCA MSRs; those should be handled by the general MSR handler.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jan 29 06:47:24 2010 +0000 (2010-01-29)
parents 30bfa1d8895d
children 7310235f74f8
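A minimal sketch of the range check this change describes, assuming (as in the
mce_rdmsr/mce_wrmsr code below) that each bank owns four MSRs starting at
MSR_IA32_MC0_CTL and that the guest's bank count lives in the low bits of its
virtual MCG_CAP; the helper name is illustrative only and does not appear in
the source:

    /* Illustrative sketch: does this MSR address a vMCE bank that the guest
     * actually has?  If not, the access is left to the general MSR handler. */
    static int is_vmce_bank_msr(const struct domain *d, uint32_t msr)
    {
        unsigned int bank;

        if (msr < MSR_IA32_MC0_CTL ||
            msr >= MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS)
            return 0;    /* not a banked MCA MSR at all */

        bank = (msr - MSR_IA32_MC0_CTL) / 4;
        /* Only banks present on the host platform are virtualised here. */
        return bank < (unsigned int)(d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT);
    }

In mce_rdmsr/mce_wrmsr below the same test is open-coded: an access to a bank
at or beyond the host's bank count returns 0 ("not handled"), so the caller
falls through to the general, vendor-specific MSR handling.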
line source
1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/config.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/sched-if.h>
15 #include <xen/cpumask.h>
16 #include <xen/event.h>
17 #include <xen/guest_access.h>
18 #include <xen/hypercall.h> /* for do_mca */
20 #include <asm/processor.h>
21 #include <asm/system.h>
22 #include <asm/msr.h>
24 #include "mce.h"
26 int mce_disabled;
27 invbool_param("mce", mce_disabled);
29 int is_mc_panic;
30 unsigned int nr_mce_banks;
32 static uint64_t g_mcg_cap;
34 static void intpose_init(void);
35 static void mcinfo_clear(struct mc_info *);
37 #define SEG_PL(segsel) ((segsel) & 0x3)
38 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
40 #if 0
41 static int x86_mcerr(const char *msg, int err)
42 {
43 gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n",
44 msg != NULL ? msg : "", err);
45 return err;
46 }
47 #else
48 #define x86_mcerr(msg, err) (err)
49 #endif
51 cpu_banks_t mca_allbanks;
53 int mce_verbosity;
54 static void __init mce_set_verbosity(char *str)
55 {
56 if (strcmp("verbose", str) == 0)
57 mce_verbosity = MCE_VERBOSE;
58 else
59 printk(KERN_DEBUG "Machine Check verbosity level %s not recognised; "
60 "use mce_verbosity=verbose\n", str);
61 }
62 custom_param("mce_verbosity", mce_set_verbosity);
64 /* Handle unconfigured int18 (should never happen) */
65 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
66 {
67 printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
68 smp_processor_id());
69 }
72 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
74 void x86_mce_vector_register(x86_mce_vector_t hdlr)
75 {
76 _machine_check_vector = hdlr;
77 wmb();
78 }
80 /* Call the installed machine check handler for this CPU setup. */
82 void machine_check_vector(struct cpu_user_regs *regs, long error_code)
83 {
84 _machine_check_vector(regs, error_code);
85 }
87 /* Init machine check callback handler
88 * It is used to collect additional information provided by newer
89 * CPU families/models without the need to duplicate the whole handler.
90 * This avoids having many handlers doing almost the same thing, each
91 * with its own tweaks and bugs. */
92 static x86_mce_callback_t mc_callback_bank_extended = NULL;
94 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
95 {
96 mc_callback_bank_extended = cbfunc;
97 }
99 /* Machine check recoverable judgement callback handler
100 * It is used to judge whether a UC error is recoverable by software
101 */
102 static mce_recoverable_t mc_recoverable_scan = NULL;
104 void mce_recoverable_register(mce_recoverable_t cbfunc)
105 {
106 mc_recoverable_scan = cbfunc;
107 }
109 /* Callback for judging whether a Machine Check error bank should be cleared.
110 * According to Intel's latest MCA OS Recovery Writer's Guide,
111 * whether the error MCA bank needs to be cleared is decided by the mca_source
112 * and the MCi_STATUS bit values.
113 */
114 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
116 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
117 {
118 mc_need_clearbank_scan = cbfunc;
119 }
121 /* Utility function to perform MCA bank telemetry readout and to push that
122 * telemetry towards an interested dom0 for logging and diagnosis.
123 * The caller - #MC handler or MCA poll function - must arrange that we
124 * do not migrate cpus. */
126 /* XXFM Could add overflow counting? */
128 /* Add out_param clear_bank for Machine Check Handler Caller.
129 * For the latest Intel CPUs, whether to clear the error bank status needs to
130 * be judged by the callback function defined above.
131 */
132 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
133 struct mca_summary *sp, cpu_banks_t* clear_bank)
134 {
135 struct vcpu *v = current;
136 struct domain *d;
137 uint64_t gstatus, status, addr, misc;
138 struct mcinfo_global mcg; /* on stack */
139 struct mcinfo_common *mic;
140 struct mcinfo_global *mig; /* on stack */
141 mctelem_cookie_t mctc = NULL;
142 uint32_t uc = 0, pcc = 0, recover, need_clear = 1;
143 struct mc_info *mci = NULL;
144 mctelem_class_t which = MC_URGENT; /* XXXgcc */
145 unsigned int cpu_nr;
146 int errcnt = 0;
147 int i;
148 enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
150 cpu_nr = smp_processor_id();
151 BUG_ON(cpu_nr != v->processor);
153 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
155 memset(&mcg, 0, sizeof (mcg));
156 mcg.common.type = MC_TYPE_GLOBAL;
157 mcg.common.size = sizeof (mcg);
158 if (v != NULL && ((d = v->domain) != NULL)) {
159 mcg.mc_domid = d->domain_id;
160 mcg.mc_vcpuid = v->vcpu_id;
161 } else {
162 mcg.mc_domid = -1;
163 mcg.mc_vcpuid = -1;
164 }
165 mcg.mc_gstatus = gstatus; /* MCG_STATUS */
167 switch (who) {
168 case MCA_MCE_HANDLER:
169 case MCA_MCE_SCAN:
170 mcg.mc_flags = MC_FLAG_MCE;
171 which = MC_URGENT;
172 break;
174 case MCA_POLLER:
175 case MCA_RESET:
176 mcg.mc_flags = MC_FLAG_POLLED;
177 which = MC_NONURGENT;
178 break;
180 case MCA_CMCI_HANDLER:
181 mcg.mc_flags = MC_FLAG_CMCI;
182 which = MC_NONURGENT;
183 break;
185 default:
186 BUG();
187 }
189 /* Retrieve detector information */
190 x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
191 &mcg.mc_coreid, &mcg.mc_core_threadid,
192 &mcg.mc_apicid, NULL, NULL, NULL);
194 /* If no mc_recoverable_scan callback handler is registered,
195 * the error is treated as not recoverable
196 */
197 recover = (mc_recoverable_scan)? 1: 0;
199 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
200 struct mcinfo_bank mcb; /* on stack */
202 /* Skip bank if corresponding bit in bankmask is clear */
203 if (!test_bit(i, bankmask))
204 continue;
206 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
207 if (!(status & MCi_STATUS_VAL))
208 continue; /* this bank has no valid telemetry */
210 /* For the latest Intel CPUs, the CMCI/MCE handler caller needs to
211 * decide whether to clear the bank based on MCi_STATUS bits such as
212 * OVER/UC/EN/PCC/S/AR
213 */
214 if ( mc_need_clearbank_scan )
215 need_clear = mc_need_clearbank_scan(who, status);
217 /* If this is the first bank with valid MCA DATA, then
218 * try to reserve an entry from the urgent/nonurgent queue
219 * depending on whether we are called from an exception or
220 * a poller; this can fail (for example dom0 may not
221 * yet have consumed past telemetry). */
222 if (errcnt == 0) {
223 if ((mctc = mctelem_reserve(which)) != NULL) {
224 mci = mctelem_dataptr(mctc);
225 mcinfo_clear(mci);
226 }
227 }
229 memset(&mcb, 0, sizeof (mcb));
230 mcb.common.type = MC_TYPE_BANK;
231 mcb.common.size = sizeof (mcb);
232 mcb.mc_bank = i;
233 mcb.mc_status = status;
235 /* form a mask of which banks have logged uncorrected errors */
236 if ((status & MCi_STATUS_UC) != 0)
237 uc |= (1 << i);
239 /* likewise for those with processor context corrupt */
240 if ((status & MCi_STATUS_PCC) != 0)
241 pcc |= (1 << i);
243 if (recover && uc)
244 /* uc = 1, recover = 1, we need not panic.
245 */
246 recover = mc_recoverable_scan(status);
248 addr = misc = 0;
250 if (status & MCi_STATUS_ADDRV) {
251 mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
252 if (mfn_valid(paddr_to_pfn(addr))) {
253 d = maddr_get_owner(addr);
254 if (d != NULL && (who == MCA_POLLER ||
255 who == MCA_CMCI_HANDLER))
256 mcb.mc_domid = d->domain_id;
257 }
258 }
260 if (status & MCi_STATUS_MISCV)
261 mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
263 mcb.mc_addr = addr;
264 mcb.mc_misc = misc;
266 if (who == MCA_CMCI_HANDLER) {
267 mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
268 rdtscll(mcb.mc_tsc);
269 }
271 /* Increment the error count; if this is the first bank
272 * with a valid error then add the global info to the mcinfo. */
273 if (errcnt++ == 0 && mci != NULL)
274 x86_mcinfo_add(mci, &mcg);
276 /* Add the bank data */
277 if (mci != NULL)
278 x86_mcinfo_add(mci, &mcb);
280 if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
281 cbret = mc_callback_bank_extended(mci, i, status);
282 }
284 /* By default, need_clear = 1 */
285 if (who != MCA_MCE_SCAN && need_clear)
286 /* Clear status */
287 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
288 else if ( who == MCA_MCE_SCAN && need_clear)
289 set_bit(i, clear_bank);
291 wmb();
292 }
294 if (mci != NULL && errcnt > 0) {
295 x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
296 mig = (struct mcinfo_global *)mic;
297 if (pcc)
298 mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
299 else if (uc)
300 mcg.mc_flags |= MC_FLAG_RECOVERABLE;
301 else
302 mcg.mc_flags |= MC_FLAG_CORRECTABLE;
303 }
306 if (sp) {
307 sp->errcnt = errcnt;
308 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
309 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
310 sp->uc = uc;
311 sp->pcc = pcc;
312 sp->recoverable = recover;
313 }
315 return mci != NULL ? mctc : NULL; /* may be NULL */
316 }
318 #define DOM_NORMAL 0
319 #define DOM0_TRAP 1
320 #define DOMU_TRAP 2
321 #define DOMU_KILLED 4
323 /* Shared #MC handler. */
324 void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
325 cpu_banks_t bankmask)
326 {
327 int xen_state_lost, dom0_state_lost, domU_state_lost;
328 struct vcpu *v = current;
329 struct domain *curdom = v->domain;
330 domid_t domid = curdom->domain_id;
331 int ctx_xen, ctx_dom0, ctx_domU;
332 uint32_t dom_state = DOM_NORMAL;
333 mctelem_cookie_t mctc = NULL;
334 struct mca_summary bs;
335 struct mc_info *mci = NULL;
336 int irqlocked = 0;
337 uint64_t gstatus;
338 int ripv;
340 /* This handler runs as an interrupt gate, so IPIs from the
341 * polling service routine are deferred until we're finished.
342 */
344 /* Disable interrupts for the _vcpu_. It must not be re-scheduled to
345 * another physical CPU. */
346 vcpu_schedule_lock_irq(v);
347 irqlocked = 1;
349 /* Read global status; if it does not indicate machine check
350 * in progress then bail as long as we have a valid ip to return to. */
351 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
352 ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
353 if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
354 add_taint(TAINT_MACHINE_CHECK); /* questionable */
355 vcpu_schedule_unlock_irq(v);
356 irqlocked = 0;
357 goto cmn_handler_done;
358 }
360 /* Go and grab error telemetry. We must choose whether to commit
361 * for logging or dismiss the cookie that is returned, and must not
362 * reference the cookie after that action.
363 */
364 mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
365 if (mctc != NULL)
366 mci = (struct mc_info *)mctelem_dataptr(mctc);
368 /* Clear MCIP or another #MC will enter shutdown state */
369 gstatus &= ~MCG_STATUS_MCIP;
370 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
371 wmb();
373 /* If no valid errors and our stack is intact, we're done */
374 if (ripv && bs.errcnt == 0) {
375 vcpu_schedule_unlock_irq(v);
376 irqlocked = 0;
377 goto cmn_handler_done;
378 }
380 if (bs.uc || bs.pcc)
381 add_taint(TAINT_MACHINE_CHECK);
383 /* Machine check exceptions will usually be for UC and/or PCC errors,
384 * but it is possible to configure machine check for some classes
385 * of corrected error.
386 *
387 * UC errors could compromise any domain or the hypervisor
388 * itself - for example a cache writeback of modified data that
389 * turned out to be bad could be for data belonging to anyone, not
390 * just the current domain. In the absence of known data poisoning
391 * to prevent consumption of such bad data in the system we regard
392 * all UC errors as terminal. It may be possible to attempt some
393 * heuristics based on the address affected, which guests have
394 * mappings to that mfn etc.
395 *
396 * PCC errors apply to the current context.
397 *
398 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
399 * and not PCC is terminal - the return instruction pointer
400 * pushed onto the stack is bogus. If the interrupt context is
401 * the hypervisor or dom0 the game is over, otherwise we can
402 * limit the impact to a single domU but only if we trampoline
403 * somewhere safely - we can't return and unwind the stack.
404 * Since there is no trampoline in place we will treat !RIPV
405 * as terminal for any context.
406 */
407 ctx_xen = SEG_PL(regs->cs) == 0;
408 ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
409 ctx_domU = !ctx_xen && !ctx_dom0;
411 xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
412 !ripv;
413 dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
414 domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
416 if (xen_state_lost) {
417 /* Now we are going to panic anyway. Allow interrupts, so that
418 * printk on serial console can work. */
419 vcpu_schedule_unlock_irq(v);
420 irqlocked = 0;
422 printk("Terminal machine check exception occurred in "
423 "hypervisor context.\n");
425 /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
426 * to the error, then it makes sense to print a stack trace.
427 * That can be useful for more detailed error analysis and/or
428 * error case studies to figure out whether we can clear
429 * xen_impacted and kill a DomU instead
430 * (i.e. if a guest-only control structure is affected, but then
431 * we must ensure the bad pages are not re-used again).
432 */
433 if (bs.eipv) {
434 printk("MCE: Instruction Pointer is related to the "
435 "error, therefore print the execution state.\n");
436 show_execution_state(regs);
437 }
439 /* Commit the telemetry so that panic flow can find it. */
440 if (mctc != NULL) {
441 x86_mcinfo_dump(mci);
442 mctelem_commit(mctc);
443 }
444 mc_panic("Hypervisor state lost due to machine check "
445 "exception.\n");
446 /*NOTREACHED*/
447 }
449 /*
450 * Xen hypervisor state is intact. If dom0 state is lost then
451 * give it a chance to decide what to do if it has registered
452 * a handler for this event, otherwise panic.
453 *
454 * XXFM Could add some Solaris dom0 contract kill here?
455 */
456 if (dom0_state_lost) {
457 if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
458 dom_state = DOM0_TRAP;
459 send_guest_trap(dom0, 0, TRAP_machine_check);
460 /* XXFM case of return with !ripv ??? */
461 } else {
462 /* Commit telemetry for panic flow. */
463 if (mctc != NULL) {
464 x86_mcinfo_dump(mci);
465 mctelem_commit(mctc);
466 }
467 mc_panic("Dom0 state lost due to machine check "
468 "exception\n");
469 /*NOTREACHED*/
470 }
471 }
473 /*
474 * If a domU has lost state then send it a trap if it has registered
475 * a handler, otherwise crash the domain.
476 * XXFM Revisit this functionality.
477 */
478 if (domU_state_lost) {
479 if (guest_has_trap_callback(v->domain, v->vcpu_id,
480 TRAP_machine_check)) {
481 dom_state = DOMU_TRAP;
482 send_guest_trap(curdom, v->vcpu_id,
483 TRAP_machine_check);
484 } else {
485 dom_state = DOMU_KILLED;
486 /* Enable interrupts. This basically results in
487 * calling sti on the *physical* cpu. But after
488 * domain_crash() the vcpu pointer is invalid.
489 * Therefore, we must unlock the irqs before killing
490 * it. */
491 vcpu_schedule_unlock_irq(v);
492 irqlocked = 0;
494 /* DomU is impacted. Kill it and continue. */
495 domain_crash(curdom);
496 }
497 }
499 switch (dom_state) {
500 case DOM0_TRAP:
501 case DOMU_TRAP:
502 /* Enable interrupts. */
503 vcpu_schedule_unlock_irq(v);
504 irqlocked = 0;
506 /* guest softirqs and event callbacks are scheduled
507 * immediately after this handler exits. */
508 break;
509 case DOMU_KILLED:
510 /* Nothing to do here. */
511 break;
513 case DOM_NORMAL:
514 vcpu_schedule_unlock_irq(v);
515 irqlocked = 0;
516 break;
517 }
519 cmn_handler_done:
520 BUG_ON(irqlocked);
521 BUG_ON(!ripv);
523 if (bs.errcnt) {
524 /* Not panicking, so forward telemetry to dom0 now if it
525 * is interested. */
526 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
527 if (mctc != NULL)
528 mctelem_commit(mctc);
529 send_guest_global_virq(dom0, VIRQ_MCA);
530 } else {
531 x86_mcinfo_dump(mci);
532 if (mctc != NULL)
533 mctelem_dismiss(mctc);
534 }
535 } else if (mctc != NULL) {
536 mctelem_dismiss(mctc);
537 }
538 }
540 void mcheck_mca_clearbanks(cpu_banks_t bankmask)
541 {
542 int i;
543 uint64_t status;
545 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
546 if (!test_bit(i, bankmask))
547 continue;
548 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
549 if (!(status & MCi_STATUS_VAL))
550 continue;
551 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
552 }
553 }
555 static int amd_mcheck_init(struct cpuinfo_x86 *ci)
556 {
557 int rc = 0;
559 switch (ci->x86) {
560 case 6:
561 rc = amd_k7_mcheck_init(ci);
562 break;
564 default:
565 /* Assume that machine check support is available.
566 * The minimum provided support is at least the K8. */
567 case 0xf:
568 rc = amd_k8_mcheck_init(ci);
569 break;
571 case 0x10:
572 case 0x11:
573 rc = amd_f10_mcheck_init(ci);
574 break;
575 }
577 return rc;
578 }
580 /* Check for the existence of Machine Check support */
581 int mce_available(struct cpuinfo_x86 *c)
582 {
583 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
584 }
586 /*
587 * Check if bank 0 is usable for MCE. It isn't for AMD K7,
588 * and Intel P6 family before model 0x1a.
589 */
590 int mce_firstbank(struct cpuinfo_x86 *c)
591 {
592 if (c->x86 == 6) {
593 if (c->x86_vendor == X86_VENDOR_AMD)
594 return 1;
596 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
597 return 1;
598 }
600 return 0;
601 }
603 /* This has to be run for each processor */
604 void mcheck_init(struct cpuinfo_x86 *c)
605 {
606 int inited = 0, i;
608 if (mce_disabled == 1) {
609 printk(XENLOG_INFO "MCE support disabled by bootparam\n");
610 return;
611 }
613 for (i = 0; i < MAX_NR_BANKS; i++)
614 set_bit(i,mca_allbanks);
616 /* Enforce at least MCE support in CPUID information. Individual
617 * families may also need to enforce a check for MCA support. */
618 if (!cpu_has(c, X86_FEATURE_MCE)) {
619 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
620 smp_processor_id());
621 return;
622 }
624 intpose_init();
625 mctelem_init(sizeof (struct mc_info));
627 switch (c->x86_vendor) {
628 case X86_VENDOR_AMD:
629 inited = amd_mcheck_init(c);
630 break;
632 case X86_VENDOR_INTEL:
633 switch (c->x86) {
634 case 6:
635 case 15:
636 inited = intel_mcheck_init(c);
637 break;
638 }
639 break;
641 default:
642 break;
643 }
645 set_poll_bankmask(c);
646 if (!inited)
647 printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
648 smp_processor_id());
649 }
651 u64 mce_cap_init(void)
652 {
653 u32 l, h;
654 u64 value;
656 rdmsr(MSR_IA32_MCG_CAP, l, h);
657 value = ((u64)h << 32) | l;
658 /* For Guest vMCE usage */
659 g_mcg_cap = value & ~MCG_CMCI_P;
661 if (l & MCG_CTL_P) /* Control register present ? */
662 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
664 nr_mce_banks = l & MCG_CAP_COUNT;
665 if ( nr_mce_banks > MAX_NR_BANKS )
666 {
667 printk(KERN_WARNING "MCE: exceed max mce banks\n");
668 g_mcg_cap = (g_mcg_cap & ~MCG_CAP_COUNT) | MAX_NR_BANKS;
669 }
671 return value;
672 }
674 /* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
675 void mce_init_msr(struct domain *d)
676 {
677 d->arch.vmca_msrs.mcg_status = 0x0;
678 d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
679 d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
680 d->arch.vmca_msrs.nr_injection = 0;
681 memset(d->arch.vmca_msrs.mci_ctl, ~0,
682 sizeof(d->arch.vmca_msrs.mci_ctl));
683 INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
684 spin_lock_init(&d->arch.vmca_msrs.lock);
685 }
687 int mce_rdmsr(uint32_t msr, uint64_t *val)
688 {
689 struct domain *d = current->domain;
690 int ret = 1;
691 unsigned int bank;
692 struct bank_entry *entry = NULL;
694 *val = 0;
695 spin_lock(&d->arch.vmca_msrs.lock);
697 switch ( msr )
698 {
699 case MSR_IA32_MCG_STATUS:
700 *val = d->arch.vmca_msrs.mcg_status;
701 if (*val)
702 mce_printk(MCE_VERBOSE,
703 "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
704 break;
705 case MSR_IA32_MCG_CAP:
706 *val = d->arch.vmca_msrs.mcg_cap;
707 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
708 *val);
709 break;
710 case MSR_IA32_MCG_CTL:
711 *val = d->arch.vmca_msrs.mcg_ctl;
712 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
713 *val);
714 break;
715 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
716 bank = (msr - MSR_IA32_MC0_CTL) / 4;
717 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
718 {
719 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
720 ret = 0;
721 break;
722 }
723 switch (msr & (MSR_IA32_MC0_CTL | 3))
724 {
725 case MSR_IA32_MC0_CTL:
726 *val = d->arch.vmca_msrs.mci_ctl[bank];
727 mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
728 bank, *val);
729 break;
730 case MSR_IA32_MC0_STATUS:
731 /* Only the bank that logged an error is read; other banks simply return 0. */
732 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
733 {
734 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
735 struct bank_entry, list);
736 if (entry->bank == bank) {
737 *val = entry->mci_status;
738 mce_printk(MCE_VERBOSE,
739 "MCE: rd MC%u_STATUS in vMCE# context "
740 "value 0x%"PRIx64"\n", bank, *val);
741 }
742 else
743 entry = NULL;
744 }
745 break;
746 case MSR_IA32_MC0_ADDR:
747 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
748 {
749 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
750 struct bank_entry, list);
751 if ( entry->bank == bank )
752 {
753 *val = entry->mci_addr;
754 mce_printk(MCE_VERBOSE,
755 "MCE: rdmsr MC%u_ADDR in vMCE# context "
756 "0x%"PRIx64"\n", bank, *val);
757 }
758 }
759 break;
760 case MSR_IA32_MC0_MISC:
761 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
762 {
763 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
764 struct bank_entry, list);
765 if ( entry->bank == bank )
766 {
767 *val = entry->mci_misc;
768 mce_printk(MCE_VERBOSE,
769 "MCE: rd MC%u_MISC in vMCE# context "
770 "0x%"PRIx64"\n", bank, *val);
771 }
772 }
773 break;
774 }
775 break;
776 default:
777 switch ( boot_cpu_data.x86_vendor )
778 {
779 case X86_VENDOR_INTEL:
780 ret = intel_mce_rdmsr(msr, val);
781 break;
782 default:
783 ret = 0;
784 break;
785 }
786 break;
787 }
789 spin_unlock(&d->arch.vmca_msrs.lock);
790 return ret;
791 }
793 int mce_wrmsr(u32 msr, u64 val)
794 {
795 struct domain *d = current->domain;
796 struct bank_entry *entry = NULL;
797 unsigned int bank;
798 int ret = 1;
800 if ( !g_mcg_cap )
801 return 0;
803 spin_lock(&d->arch.vmca_msrs.lock);
805 switch ( msr )
806 {
807 case MSR_IA32_MCG_CTL:
808 if ( val && (val + 1) )
809 {
810 mce_printk(MCE_QUIET, "MCE: val \"%"PRIx64"\" written "
811 "to MCG_CTL should be all 0s or 1s\n", val);
812 ret = -1;
813 break;
814 }
815 d->arch.vmca_msrs.mcg_ctl = val;
816 break;
817 case MSR_IA32_MCG_STATUS:
818 d->arch.vmca_msrs.mcg_status = val;
819 mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
820 /* For an HVM guest, this is the point at which the vMCE injection node is deleted */
821 if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
822 {
823 d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
824 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
825 {
826 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
827 struct bank_entry, list);
828 if ( entry->mci_status & MCi_STATUS_VAL )
829 mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
830 "been cleared before write MCG_STATUS MSR\n");
832 mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
833 "Node, nr_injection %u\n",
834 d->arch.vmca_msrs.nr_injection);
835 list_del(&entry->list);
836 xfree(entry);
837 }
838 else
839 mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
840 " last injection Node, something Wrong!\n");
841 }
842 break;
843 case MSR_IA32_MCG_CAP:
844 mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
845 ret = -1;
846 break;
847 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
848 bank = (msr - MSR_IA32_MC0_CTL) / 4;
849 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
850 {
851 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
852 ret = 0;
853 break;
854 }
855 switch ( msr & (MSR_IA32_MC0_CTL | 3) )
856 {
857 case MSR_IA32_MC0_CTL:
858 if ( val && (val + 1) )
859 {
860 mce_printk(MCE_QUIET, "MCE: val written to MC%u_CTL "
861 "should be all 0s or 1s (is %"PRIx64")\n",
862 bank, val);
863 ret = -1;
864 break;
865 }
866 d->arch.vmca_msrs.mci_ctl[bank] = val;
867 break;
868 case MSR_IA32_MC0_STATUS:
869 /* Take the first entry of the list; it corresponds to the current
870 * vMCE# injection. When the vMCE# has been processed by the
871 * guest, this node will be deleted.
872 * Only the bank that logged the error is written; other banks simply return.
873 */
874 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
875 {
876 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
877 struct bank_entry, list);
878 if ( entry->bank == bank )
879 entry->mci_status = val;
880 mce_printk(MCE_VERBOSE,
881 "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
882 bank, val);
883 }
884 else
885 mce_printk(MCE_VERBOSE,
886 "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
887 break;
888 case MSR_IA32_MC0_ADDR:
889 mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
890 ret = -1;
891 break;
892 case MSR_IA32_MC0_MISC:
893 mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
894 ret = -1;
895 break;
896 }
897 break;
898 default:
899 switch ( boot_cpu_data.x86_vendor )
900 {
901 case X86_VENDOR_INTEL:
902 ret = intel_mce_wrmsr(msr, val);
903 break;
904 default:
905 ret = 0;
906 break;
907 }
908 break;
909 }
911 spin_unlock(&d->arch.vmca_msrs.lock);
912 return ret;
913 }
915 static void mcinfo_clear(struct mc_info *mi)
916 {
917 memset(mi, 0, sizeof(struct mc_info));
918 x86_mcinfo_nentries(mi) = 0;
919 }
921 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
922 {
923 int i;
924 unsigned long end1, end2;
925 struct mcinfo_common *mic, *mic_base, *mic_index;
927 mic = (struct mcinfo_common *)mcinfo;
928 mic_index = mic_base = x86_mcinfo_first(mi);
930 /* go to first free entry */
931 for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
932 mic_index = x86_mcinfo_next(mic_index);
933 }
935 /* check if there is enough size */
936 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
937 end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
939 if (end1 < end2)
940 return x86_mcerr("mcinfo_add: no more space", -ENOSPC);
942 /* there's enough space. add entry. */
943 memcpy(mic_index, mic, mic->size);
944 x86_mcinfo_nentries(mi)++;
946 return 0;
947 }
949 /* Dump machine check information in a format that
950 * mcelog can parse. This is used only when
951 * Dom0 does not take the notification. */
952 void x86_mcinfo_dump(struct mc_info *mi)
953 {
954 struct mcinfo_common *mic = NULL;
955 struct mcinfo_global *mc_global;
956 struct mcinfo_bank *mc_bank;
958 /* first print the global info */
959 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
960 if (mic == NULL)
961 return;
962 mc_global = (struct mcinfo_global *)mic;
963 if (mc_global->mc_flags & MC_FLAG_MCE) {
964 printk(XENLOG_WARNING
965 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
966 mc_global->mc_coreid, mc_global->mc_gstatus);
967 } else {
968 printk(XENLOG_WARNING "MCE: The hardware reports a non-"
969 "fatal, correctable incident that occurred on "
970 "CPU %d.\n",
971 mc_global->mc_coreid);
972 }
974 /* then the bank information */
975 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
976 do {
977 if (mic == NULL)
978 return;
979 if (mic->type != MC_TYPE_BANK)
980 goto next;
982 mc_bank = (struct mcinfo_bank *)mic;
984 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
985 mc_bank->mc_bank,
986 mc_bank->mc_status);
987 if (mc_bank->mc_status & MCi_STATUS_MISCV)
988 printk("[%16"PRIx64"]", mc_bank->mc_misc);
989 if (mc_bank->mc_status & MCi_STATUS_ADDRV)
990 printk(" at %16"PRIx64, mc_bank->mc_addr);
992 printk("\n");
993 next:
994 mic = x86_mcinfo_next(mic); /* next entry */
995 if ((mic == NULL) || (mic->size == 0))
996 break;
997 } while (1);
998 }
1000 static void do_mc_get_cpu_info(void *v)
1002 int cpu = smp_processor_id();
1003 int cindex, cpn;
1004 struct cpuinfo_x86 *c;
1005 xen_mc_logical_cpu_t *log_cpus, *xcp;
1006 uint32_t junk, ebx;
1008 log_cpus = v;
1009 c = &cpu_data[cpu];
1010 cindex = 0;
1011 cpn = cpu - 1;
1013 /*
1014 * Deal with sparse masks, condensed into a contiguous array.
1015 */
1016 while (cpn >= 0) {
1017 if (cpu_isset(cpn, cpu_online_map))
1018 cindex++;
1019 cpn--;
1022 xcp = &log_cpus[cindex];
1023 c = &cpu_data[cpu];
1024 xcp->mc_cpunr = cpu;
1025 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
1026 &xcp->mc_coreid, &xcp->mc_threadid,
1027 &xcp->mc_apicid, &xcp->mc_ncores,
1028 &xcp->mc_ncores_active, &xcp->mc_nthreads);
1029 xcp->mc_cpuid_level = c->cpuid_level;
1030 xcp->mc_family = c->x86;
1031 xcp->mc_vendor = c->x86_vendor;
1032 xcp->mc_model = c->x86_model;
1033 xcp->mc_step = c->x86_mask;
1034 xcp->mc_cache_size = c->x86_cache_size;
1035 xcp->mc_cache_alignment = c->x86_cache_alignment;
1036 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1037 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1038 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1040 /*
1041 * This part needs to run on the CPU itself.
1042 */
1043 xcp->mc_nmsrvals = __MC_NMSRS;
1044 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1045 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1047 if (c->cpuid_level >= 1) {
1048 cpuid(1, &junk, &ebx, &junk, &junk);
1049 xcp->mc_clusterid = (ebx >> 24) & 0xff;
1050 } else
1051 xcp->mc_clusterid = hard_smp_processor_id();
1055 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
1056 uint16_t *threadid, uint32_t *apicid,
1057 unsigned *ncores, unsigned *ncores_active,
1058 unsigned *nthreads)
1060 struct cpuinfo_x86 *c;
1062 *apicid = cpu_physical_id(cpu);
1063 c = &cpu_data[cpu];
1064 if (c->apicid == BAD_APICID) {
1065 *chipid = cpu;
1066 *coreid = 0;
1067 *threadid = 0;
1068 if (ncores != NULL)
1069 *ncores = 1;
1070 if (ncores_active != NULL)
1071 *ncores_active = 1;
1072 if (nthreads != NULL)
1073 *nthreads = 1;
1074 } else {
1075 *chipid = phys_proc_id[cpu];
1076 if (c->x86_max_cores > 1)
1077 *coreid = cpu_core_id[cpu];
1078 else
1079 *coreid = 0;
1080 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1081 if (ncores != NULL)
1082 *ncores = c->x86_max_cores;
1083 if (ncores_active != NULL)
1084 *ncores_active = c->booted_cores;
1085 if (nthreads != NULL)
1086 *nthreads = c->x86_num_siblings;
1090 #define INTPOSE_NENT 50
1092 static struct intpose_ent {
1093 unsigned int cpu_nr;
1094 uint64_t msr;
1095 uint64_t val;
1096 } intpose_arr[INTPOSE_NENT];
1098 static void intpose_init(void)
1100 static int done;
1101 int i;
1103 if (done++ > 0)
1104 return;
1106 for (i = 0; i < INTPOSE_NENT; i++) {
1107 intpose_arr[i].cpu_nr = -1;
1112 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1113 uint64_t *valp)
1115 int i;
1117 for (i = 0; i < INTPOSE_NENT; i++) {
1118 if (intpose_arr[i].cpu_nr == cpu_nr &&
1119 intpose_arr[i].msr == msr) {
1120 if (valp != NULL)
1121 *valp = intpose_arr[i].val;
1122 return &intpose_arr[i];
1126 return NULL;
1129 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1131 struct intpose_ent *ent;
1132 int i;
1134 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1135 ent->val = val;
1136 return;
1139 for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
1140 if (ent->cpu_nr == -1) {
1141 ent->cpu_nr = cpu_nr;
1142 ent->msr = msr;
1143 ent->val = val;
1144 return;
1148 printk("intpose_add: interpose array full - request dropped\n");
1151 void intpose_inval(unsigned int cpu_nr, uint64_t msr)
1153 struct intpose_ent *ent;
1155 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1156 ent->cpu_nr = -1;
1160 #define IS_MCA_BANKREG(r) \
1161 ((r) >= MSR_IA32_MC0_CTL && \
1162 (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
1163 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
1165 static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1167 struct cpuinfo_x86 *c;
1168 int i, errs = 0;
1170 c = &cpu_data[smp_processor_id()];
1172 for (i = 0; i < mci->mcinj_count; i++) {
1173 uint64_t reg = mci->mcinj_msr[i].reg;
1174 const char *reason = NULL;
1176 if (IS_MCA_BANKREG(reg)) {
1177 if (c->x86_vendor == X86_VENDOR_AMD) {
1178 /* On AMD we can set MCi_STATUS_WREN in the
1179 * HWCR MSR to allow non-zero writes to bank
1180 * MSRs without a #GP. The injector in dom0
1181 * should set that bit, but we detect when it
1182 * is necessary and set it as a courtesy to
1183 * avoid #GP in the hypervisor. */
1184 mci->mcinj_flags |=
1185 _MC_MSRINJ_F_REQ_HWCR_WREN;
1186 continue;
1187 } else {
1188 /* No alternative but to interpose, so require
1189 * that the injector specified interposition. */
1190 if (!(mci->mcinj_flags &
1191 MC_MSRINJ_F_INTERPOSE)) {
1192 reason = "must specify interposition";
1195 } else {
1196 switch (reg) {
1197 /* MSRs acceptable on all x86 cpus */
1198 case MSR_IA32_MCG_STATUS:
1199 break;
1201 /* MSRs that the HV will take care of */
1202 case MSR_K8_HWCR:
1203 if (c->x86_vendor == X86_VENDOR_AMD)
1204 reason = "HV will operate HWCR";
1205 else
1206 reason = "only supported on AMD";
1207 break;
1209 default:
1210 reason = "not a recognized MCA MSR";
1211 break;
1215 if (reason != NULL) {
1216 printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
1217 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1218 errs++;
1222 return !errs;
1225 static uint64_t x86_mc_hwcr_wren(void)
1227 uint64_t old;
1229 rdmsrl(MSR_K8_HWCR, old);
1231 if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
1232 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1233 wrmsrl(MSR_K8_HWCR, new);
1236 return old;
1239 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1241 if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
1242 wrmsrl(MSR_K8_HWCR, hwcr);
1245 static void x86_mc_msrinject(void *data)
1247 struct xen_mc_msrinject *mci = data;
1248 struct mcinfo_msr *msr;
1249 struct cpuinfo_x86 *c;
1250 uint64_t hwcr = 0;
1251 int intpose;
1252 int i;
1254 c = &cpu_data[smp_processor_id()];
1256 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1257 hwcr = x86_mc_hwcr_wren();
1259 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1261 for (i = 0, msr = &mci->mcinj_msr[0];
1262 i < mci->mcinj_count; i++, msr++) {
1263 printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
1264 "<-- 0x%llx\n",
1265 intpose ? "interpose" : "hardware",
1266 mci->mcinj_cpunr, smp_processor_id(),
1267 (unsigned long long)msr->reg,
1268 (unsigned long long)msr->value);
1270 if (intpose)
1271 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1272 else
1273 wrmsrl(msr->reg, msr->value);
1276 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1277 x86_mc_hwcr_wren_restore(hwcr);
1280 /*ARGSUSED*/
1281 static void x86_mc_mceinject(void *data)
1283 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1284 __asm__ __volatile__("int $0x12");
1287 #if BITS_PER_LONG == 64
1289 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1290 #define COOKIE2ID(c) ((uint64_t)(c))
1292 #elif BITS_PER_LONG == 32
1294 #define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
1295 #define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
1297 #elif defined(BITS_PER_LONG)
1298 #error BITS_PER_LONG has unexpected value
1299 #else
1300 #error BITS_PER_LONG definition absent
1301 #endif
1303 #ifdef CONFIG_COMPAT
1304 # include <compat/arch-x86/xen-mca.h>
1306 # define xen_mcinfo_msr mcinfo_msr
1307 CHECK_mcinfo_msr;
1308 # undef xen_mcinfo_msr
1309 # undef CHECK_mcinfo_msr
1310 # define CHECK_mcinfo_msr struct mcinfo_msr
1312 # define xen_mcinfo_common mcinfo_common
1313 CHECK_mcinfo_common;
1314 # undef xen_mcinfo_common
1315 # undef CHECK_mcinfo_common
1316 # define CHECK_mcinfo_common struct mcinfo_common
1318 CHECK_FIELD_(struct, mc_fetch, flags);
1319 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1320 # define CHECK_compat_mc_fetch struct mc_fetch
1322 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1323 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1325 CHECK_mc;
1326 # undef CHECK_compat_mc_fetch
1327 # undef CHECK_compat_mc_physcpuinfo
1329 # define xen_mc_info mc_info
1330 CHECK_mc_info;
1331 # undef xen_mc_info
1333 # define xen_mcinfo_global mcinfo_global
1334 CHECK_mcinfo_global;
1335 # undef xen_mcinfo_global
1337 # define xen_mcinfo_bank mcinfo_bank
1338 CHECK_mcinfo_bank;
1339 # undef xen_mcinfo_bank
1341 # define xen_mcinfo_extended mcinfo_extended
1342 CHECK_mcinfo_extended;
1343 # undef xen_mcinfo_extended
1345 # define xen_mcinfo_recovery mcinfo_recovery
1346 # define xen_cpu_offline_action cpu_offline_action
1347 # define xen_page_offline_action page_offline_action
1348 CHECK_mcinfo_recovery;
1349 # undef xen_cpu_offline_action
1350 # undef xen_page_offline_action
1351 # undef xen_mcinfo_recovery
1352 #else
1353 # define compat_mc_fetch xen_mc_fetch
1354 # define compat_mc_physcpuinfo xen_mc_physcpuinfo
1355 # define compat_handle_is_null guest_handle_is_null
1356 # define copy_to_compat copy_to_guest
1357 #endif
1359 /* Machine Check Architecture Hypercall */
1360 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
1362 long ret = 0;
1363 struct xen_mc curop, *op = &curop;
1364 struct vcpu *v = current;
1365 union {
1366 struct xen_mc_fetch *nat;
1367 struct compat_mc_fetch *cmp;
1368 } mc_fetch;
1369 union {
1370 struct xen_mc_physcpuinfo *nat;
1371 struct compat_mc_physcpuinfo *cmp;
1372 } mc_physcpuinfo;
1373 uint32_t flags, cmdflags;
1374 int nlcpu;
1375 xen_mc_logical_cpu_t *log_cpus = NULL;
1376 mctelem_cookie_t mctc;
1377 mctelem_class_t which;
1378 unsigned int target;
1379 struct xen_mc_msrinject *mc_msrinject;
1380 struct xen_mc_mceinject *mc_mceinject;
1382 if (!IS_PRIV(v->domain) )
1383 return x86_mcerr(NULL, -EPERM);
1385 if ( copy_from_guest(op, u_xen_mc, 1) )
1386 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1388 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1389 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1391 switch (op->cmd) {
1392 case XEN_MC_fetch:
1393 mc_fetch.nat = &op->u.mc_fetch;
1394 cmdflags = mc_fetch.nat->flags;
1396 switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
1397 case XEN_MC_NONURGENT:
1398 which = MC_NONURGENT;
1399 break;
1401 case XEN_MC_URGENT:
1402 which = MC_URGENT;
1403 break;
1405 default:
1406 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1409 flags = XEN_MC_OK;
1411 if (cmdflags & XEN_MC_ACK) {
1412 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1413 mctelem_ack(which, cookie);
1414 } else {
1415 if (!is_pv_32on64_vcpu(v)
1416 ? guest_handle_is_null(mc_fetch.nat->data)
1417 : compat_handle_is_null(mc_fetch.cmp->data))
1418 return x86_mcerr("do_mca fetch: guest buffer "
1419 "invalid", -EINVAL);
1421 if ((mctc = mctelem_consume_oldest_begin(which))) {
1422 struct mc_info *mcip = mctelem_dataptr(mctc);
1423 if (!is_pv_32on64_vcpu(v)
1424 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1425 : copy_to_compat(mc_fetch.cmp->data,
1426 mcip, 1)) {
1427 ret = -EFAULT;
1428 flags |= XEN_MC_FETCHFAILED;
1429 mc_fetch.nat->fetch_id = 0;
1430 } else {
1431 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1433 mctelem_consume_oldest_end(mctc);
1434 } else {
1435 /* There is no data */
1436 flags |= XEN_MC_NODATA;
1437 mc_fetch.nat->fetch_id = 0;
1440 mc_fetch.nat->flags = flags;
1441 if (copy_to_guest(u_xen_mc, op, 1) != 0)
1442 ret = -EFAULT;
1445 break;
1447 case XEN_MC_notifydomain:
1448 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1450 case XEN_MC_physcpuinfo:
1451 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1452 nlcpu = num_online_cpus();
1454 if (!is_pv_32on64_vcpu(v)
1455 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1456 : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
1457 if (mc_physcpuinfo.nat->ncpus <= 0)
1458 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1459 -EINVAL);
1460 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1461 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1462 if (log_cpus == NULL)
1463 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1465 if (on_each_cpu(do_mc_get_cpu_info, log_cpus, 1)) {
1466 xfree(log_cpus);
1467 return x86_mcerr("do_mca cpuinfo", -EIO);
1469 if (!is_pv_32on64_vcpu(v)
1470 ? copy_to_guest(mc_physcpuinfo.nat->info,
1471 log_cpus, nlcpu)
1472 : copy_to_compat(mc_physcpuinfo.cmp->info,
1473 log_cpus, nlcpu))
1474 ret = -EFAULT;
1475 xfree(log_cpus);
1478 mc_physcpuinfo.nat->ncpus = nlcpu;
1480 if (copy_to_guest(u_xen_mc, op, 1))
1481 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1483 break;
1485 case XEN_MC_msrinject:
1486 if (nr_mce_banks == 0)
1487 return x86_mcerr("do_mca inject", -ENODEV);
1489 mc_msrinject = &op->u.mc_msrinject;
1490 target = mc_msrinject->mcinj_cpunr;
1492 if (target >= NR_CPUS)
1493 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1495 if (!cpu_isset(target, cpu_online_map))
1496 return x86_mcerr("do_mca inject: target offline",
1497 -EINVAL);
1499 if (mc_msrinject->mcinj_count == 0)
1500 return 0;
1502 if (!x86_mc_msrinject_verify(mc_msrinject))
1503 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1505 add_taint(TAINT_ERROR_INJECT);
1507 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1508 mc_msrinject, 1);
1510 break;
1512 case XEN_MC_mceinject:
1513 if (nr_mce_banks == 0)
1514 return x86_mcerr("do_mca #MC", -ENODEV);
1516 mc_mceinject = &op->u.mc_mceinject;
1517 target = mc_mceinject->mceinj_cpunr;
1519 if (target >= NR_CPUS)
1520 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1522 if (!cpu_isset(target, cpu_online_map))
1523 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1525 add_taint(TAINT_ERROR_INJECT);
1527 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1528 mc_mceinject, 1);
1529 break;
1531 default:
1532 return x86_mcerr("do_mca: bad command", -EINVAL);
1535 return ret;
1537 void set_poll_bankmask(struct cpuinfo_x86 *c)
1540 if (cmci_support && !mce_disabled) {
1541 memcpy(&(__get_cpu_var(poll_bankmask)),
1542 &(__get_cpu_var(no_cmci_banks)), sizeof(cpu_banks_t));
1544 else {
1545 memcpy(&(get_cpu_var(poll_bankmask)), &mca_allbanks, sizeof(cpu_banks_t));
1546 if (mce_firstbank(c))
1547 clear_bit(0, get_cpu_var(poll_bankmask));
1550 void mc_panic(char *s)
1552 is_mc_panic = 1;
1553 console_start_sync();
1554 printk("Fatal machine check: %s\n", s);
1555 printk("\n"
1556 "****************************************\n"
1557 "\n"
1558 " The processor has reported a hardware error which cannot\n"
1559 " be recovered from. Xen will now reboot the machine.\n");
1560 panic("HARDWARE ERROR");