
view xen/arch/x86/cpu/mcheck/mce.c @ 20908:7310235f74f8

x86 mca: Do not #GP fault when a guest writes a value other than all 0s or all 1s to the MCA CTL MSRs.

a) For the MCi_CTL MSRs, the guest may write any value. When read back,
the value is ANDed with the physical value. Some bits in the physical
value can be 0, either because they are read-only in hardware (e.g.
masked by AMD's MCi_CTL_MASK) or because Xen did not enable them.
If the guest writes a bit as 0 while that bit is 1 on the host, we do
not inject an MCE for that bank into the guest, since we cannot tell
whether the MCE was caused by the event the guest disabled.

b) For the MCG_CTL MSR, the guest may likewise write any value, and
reads return the value ANDed with the physical value.
If the guest does not write all 1s, mca_ctl_conflict() simply refrains
from injecting any vMCE into the guest whenever a bit is set in the
physical MSR but cleared in the guest's vMCG_CTL MSR.
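
In other words, a virtualized CTL read returns the guest value masked by
the host value, and injection is suppressed whenever the guest has
disabled something the host has enabled. A minimal standalone C sketch of
that intent (illustrative helper names only; the real logic lives in
mce_rdmsr() and mca_ctl_conflict() in the file below):

#include <stdint.h>

/* (a) Reads of a virtual MCi_CTL (or MCG_CTL) return the guest-written
 *     value ANDed with the value actually present on the host; the
 *     arguments stand in for d->arch.vmca_msrs.mci_ctl[bank] and
 *     h_mci_ctrl[bank] (or mcg_ctl and h_mcg_ctl). */
static uint64_t vmce_ctl_read(uint64_t guest_ctl, uint64_t host_ctl)
{
    return guest_ctl & host_ctl;
}

/* (b) A conflict exists when any bit is enabled on the host but has been
 *     cleared by the guest; in that case no vMCE is injected, because the
 *     error may have been caused by an event the guest disabled. */
static int vmce_ctl_conflict(uint64_t guest_ctl, uint64_t host_ctl)
{
    return (~guest_ctl & host_ctl) != 0;
}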

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jan 29 06:48:00 2010 +0000 (2010-01-29)
parents f85120520509
children 088f1b01d852
line source
1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/config.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/sched-if.h>
15 #include <xen/cpumask.h>
16 #include <xen/event.h>
17 #include <xen/guest_access.h>
18 #include <xen/hypercall.h> /* for do_mca */
20 #include <asm/processor.h>
21 #include <asm/system.h>
22 #include <asm/msr.h>
24 #include "mce.h"
26 int mce_disabled;
27 invbool_param("mce", mce_disabled);
29 int is_mc_panic;
30 unsigned int nr_mce_banks;
32 static uint64_t g_mcg_cap;
34 /* Real value in physical CTL MSR */
35 static uint64_t h_mcg_ctl = 0UL;
36 static uint64_t *h_mci_ctrl;
37 int firstbank;
39 static void intpose_init(void);
40 static void mcinfo_clear(struct mc_info *);
42 #define SEG_PL(segsel) ((segsel) & 0x3)
43 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
45 #if 0
46 static int x86_mcerr(const char *msg, int err)
47 {
48 gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n",
49 msg != NULL ? msg : "", err);
50 return err;
51 }
52 #else
53 #define x86_mcerr(msg, err) (err)
54 #endif
56 cpu_banks_t mca_allbanks;
58 int mce_verbosity;
59 static void __init mce_set_verbosity(char *str)
60 {
61 if (strcmp("verbose", str) == 0)
62 mce_verbosity = MCE_VERBOSE;
63 else
64 printk(KERN_DEBUG "Machine Check verbosity level %s not recognised; "
65 "use mce_verbosity=verbose\n", str);
66 }
67 custom_param("mce_verbosity", mce_set_verbosity);
69 /* Handle unconfigured int18 (should never happen) */
70 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
71 {
72 printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
73 smp_processor_id());
74 }
77 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
79 void x86_mce_vector_register(x86_mce_vector_t hdlr)
80 {
81 _machine_check_vector = hdlr;
82 wmb();
83 }
85 /* Call the installed machine check handler for this CPU setup. */
87 void machine_check_vector(struct cpu_user_regs *regs, long error_code)
88 {
89 _machine_check_vector(regs, error_code);
90 }
92 /* Init machine check callback handler
93 * It is used to collect additional information provided by newer
94 * CPU families/models without the need to duplicate the whole handler.
95 * This avoids having many handlers that do almost the same thing, each
96 * with its own tweaks and bugs. */
97 static x86_mce_callback_t mc_callback_bank_extended = NULL;
99 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
100 {
101 mc_callback_bank_extended = cbfunc;
102 }
104 /* Machine check recoverable judgement callback handler
105 * It is used to judge whether a UC error is recoverable by software
106 */
107 static mce_recoverable_t mc_recoverable_scan = NULL;
109 void mce_recoverable_register(mce_recoverable_t cbfunc)
110 {
111 mc_recoverable_scan = cbfunc;
112 }
114 /* Callback handler for judging whether a Machine Check error bank should be cleared.
115 * According to Intel's latest MCA OS Recovery Writer's Guide, whether the
116 * error MCA bank needs to be cleared is decided by the mca_source
117 * and the MCi_STATUS bit values.
118 */
119 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
121 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
122 {
123 mc_need_clearbank_scan = cbfunc;
124 }
126 /* Utility function to perform MCA bank telemetry readout and to push that
127 * telemetry towards an interested dom0 for logging and diagnosis.
128 * The caller - #MC handler or MCA poll function - must arrange that we
129 * do not migrate cpus. */
131 /* XXFM Could add overflow counting? */
133 /* Add the out-parameter clear_bank for machine check handler callers.
134 * On the latest Intel CPUs, whether to clear the error bank status needs to
135 * be judged by the callback function defined above.
136 */
137 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
138 struct mca_summary *sp, cpu_banks_t* clear_bank)
139 {
140 struct vcpu *v = current;
141 struct domain *d;
142 uint64_t gstatus, status, addr, misc;
143 struct mcinfo_global mcg; /* on stack */
144 struct mcinfo_common *mic;
145 struct mcinfo_global *mig; /* on stack */
146 mctelem_cookie_t mctc = NULL;
147 uint32_t uc = 0, pcc = 0, recover, need_clear = 1;
148 struct mc_info *mci = NULL;
149 mctelem_class_t which = MC_URGENT; /* XXXgcc */
150 unsigned int cpu_nr;
151 int errcnt = 0;
152 int i;
153 enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
155 cpu_nr = smp_processor_id();
156 BUG_ON(cpu_nr != v->processor);
158 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
160 memset(&mcg, 0, sizeof (mcg));
161 mcg.common.type = MC_TYPE_GLOBAL;
162 mcg.common.size = sizeof (mcg);
163 if (v != NULL && ((d = v->domain) != NULL)) {
164 mcg.mc_domid = d->domain_id;
165 mcg.mc_vcpuid = v->vcpu_id;
166 } else {
167 mcg.mc_domid = -1;
168 mcg.mc_vcpuid = -1;
169 }
170 mcg.mc_gstatus = gstatus; /* MCG_STATUS */
172 switch (who) {
173 case MCA_MCE_HANDLER:
174 case MCA_MCE_SCAN:
175 mcg.mc_flags = MC_FLAG_MCE;
176 which = MC_URGENT;
177 break;
179 case MCA_POLLER:
180 case MCA_RESET:
181 mcg.mc_flags = MC_FLAG_POLLED;
182 which = MC_NONURGENT;
183 break;
185 case MCA_CMCI_HANDLER:
186 mcg.mc_flags = MC_FLAG_CMCI;
187 which = MC_NONURGENT;
188 break;
190 default:
191 BUG();
192 }
194 /* Retrieve detector information */
195 x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
196 &mcg.mc_coreid, &mcg.mc_core_threadid,
197 &mcg.mc_apicid, NULL, NULL, NULL);
199 /* If no mc_recoverable_scan callback handler is registered,
200 * this error is treated as not recoverable
201 */
202 recover = (mc_recoverable_scan)? 1: 0;
204 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
205 struct mcinfo_bank mcb; /* on stack */
207 /* Skip bank if corresponding bit in bankmask is clear */
208 if (!test_bit(i, bankmask))
209 continue;
211 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
212 if (!(status & MCi_STATUS_VAL))
213 continue; /* this bank has no valid telemetry */
215 /* For CMCI/MCE handler callers on the latest Intel CPUs, we need to
216 * decide whether to clear the bank based on MCi_STATUS bits such as
217 * OVER/UC/EN/PCC/S/AR
218 */
219 if ( mc_need_clearbank_scan )
220 need_clear = mc_need_clearbank_scan(who, status);
222 /* If this is the first bank with valid MCA DATA, then
223 * try to reserve an entry from the urgent/nonurgent queue
224 * depending on whether we are called from an exception or
225 * a poller; this can fail (for example dom0 may not
226 * yet have consumed past telemetry). */
227 if (errcnt == 0) {
228 if ((mctc = mctelem_reserve(which)) != NULL) {
229 mci = mctelem_dataptr(mctc);
230 mcinfo_clear(mci);
231 }
232 }
234 memset(&mcb, 0, sizeof (mcb));
235 mcb.common.type = MC_TYPE_BANK;
236 mcb.common.size = sizeof (mcb);
237 mcb.mc_bank = i;
238 mcb.mc_status = status;
240 /* form a mask of which banks have logged uncorrected errors */
241 if ((status & MCi_STATUS_UC) != 0)
242 uc |= (1 << i);
244 /* likewise for those with processor context corrupt */
245 if ((status & MCi_STATUS_PCC) != 0)
246 pcc |= (1 << i);
248 if (recover && uc)
249 /* uc set and recovery still possible: ask the callback whether
250 * this UC error is software-recoverable before deciding to panic. */
251 recover = mc_recoverable_scan(status);
253 addr = misc = 0;
255 if (status & MCi_STATUS_ADDRV) {
256 mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
257 if (mfn_valid(paddr_to_pfn(addr))) {
258 d = maddr_get_owner(addr);
259 if (d != NULL && (who == MCA_POLLER ||
260 who == MCA_CMCI_HANDLER))
261 mcb.mc_domid = d->domain_id;
262 }
263 }
265 if (status & MCi_STATUS_MISCV)
266 mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
268 mcb.mc_addr = addr;
269 mcb.mc_misc = misc;
271 if (who == MCA_CMCI_HANDLER) {
272 mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
273 rdtscll(mcb.mc_tsc);
274 }
276 /* Increment the error count; if this is the first bank
277 * with a valid error then add the global info to the mcinfo. */
278 if (errcnt++ == 0 && mci != NULL)
279 x86_mcinfo_add(mci, &mcg);
281 /* Add the bank data */
282 if (mci != NULL)
283 x86_mcinfo_add(mci, &mcb);
285 if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
286 cbret = mc_callback_bank_extended(mci, i, status);
287 }
289 /* By default, need_clear = 1 */
290 if (who != MCA_MCE_SCAN && need_clear)
291 /* Clear status */
292 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
293 else if ( who == MCA_MCE_SCAN && need_clear)
294 set_bit(i, clear_bank);
296 wmb();
297 }
299 if (mci != NULL && errcnt > 0) {
300 x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
301 mig = (struct mcinfo_global *)mic;
302 if (pcc)
303 mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
304 else if (uc)
305 mcg.mc_flags |= MC_FLAG_RECOVERABLE;
306 else
307 mcg.mc_flags |= MC_FLAG_CORRECTABLE;
308 }
311 if (sp) {
312 sp->errcnt = errcnt;
313 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
314 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
315 sp->uc = uc;
316 sp->pcc = pcc;
317 sp->recoverable = recover;
318 }
320 return mci != NULL ? mctc : NULL; /* may be NULL */
321 }
323 #define DOM_NORMAL 0
324 #define DOM0_TRAP 1
325 #define DOMU_TRAP 2
326 #define DOMU_KILLED 4
328 /* Shared #MC handler. */
329 void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
330 cpu_banks_t bankmask)
331 {
332 int xen_state_lost, dom0_state_lost, domU_state_lost;
333 struct vcpu *v = current;
334 struct domain *curdom = v->domain;
335 domid_t domid = curdom->domain_id;
336 int ctx_xen, ctx_dom0, ctx_domU;
337 uint32_t dom_state = DOM_NORMAL;
338 mctelem_cookie_t mctc = NULL;
339 struct mca_summary bs;
340 struct mc_info *mci = NULL;
341 int irqlocked = 0;
342 uint64_t gstatus;
343 int ripv;
345 /* This handler runs through an interrupt gate, so IPIs from the
346 * polling service routine are deferred until we're finished.
347 */
349 /* Disable interrupts for the _vcpu_. It must not be re-scheduled onto
350 * another physical CPU. */
351 vcpu_schedule_lock_irq(v);
352 irqlocked = 1;
354 /* Read global status; if it does not indicate machine check
355 * in progress then bail as long as we have a valid ip to return to. */
356 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
357 ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
358 if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
359 add_taint(TAINT_MACHINE_CHECK); /* questionable */
360 vcpu_schedule_unlock_irq(v);
361 irqlocked = 0;
362 goto cmn_handler_done;
363 }
365 /* Go and grab error telemetry. We must choose whether to commit
366 * for logging or dismiss the cookie that is returned, and must not
367 * reference the cookie after that action.
368 */
369 mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
370 if (mctc != NULL)
371 mci = (struct mc_info *)mctelem_dataptr(mctc);
373 /* Clear MCIP, or another #MC will put the processor into shutdown state */
374 gstatus &= ~MCG_STATUS_MCIP;
375 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
376 wmb();
378 /* If no valid errors and our stack is intact, we're done */
379 if (ripv && bs.errcnt == 0) {
380 vcpu_schedule_unlock_irq(v);
381 irqlocked = 0;
382 goto cmn_handler_done;
383 }
385 if (bs.uc || bs.pcc)
386 add_taint(TAINT_MACHINE_CHECK);
388 /* Machine check exceptions will usually be for UC and/or PCC errors,
389 * but it is possible to configure machine check for some classes
390 * of corrected error.
391 *
392 * UC errors could compromise any domain or the hypervisor
393 * itself - for example a cache writeback of modified data that
394 * turned out to be bad could be for data belonging to anyone, not
395 * just the current domain. In the absence of known data poisoning
396 * to prevent consumption of such bad data in the system we regard
397 * all UC errors as terminal. It may be possible to attempt some
398 * heuristics based on the address affected, which guests have
399 * mappings to that mfn etc.
400 *
401 * PCC errors apply to the current context.
402 *
403 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
404 * and not PCC is terminal - the return instruction pointer
405 * pushed onto the stack is bogus. If the interrupt context is
406 * the hypervisor or dom0 the game is over, otherwise we can
407 * limit the impact to a single domU but only if we trampoline
408 * somewhere safely - we can't return and unwind the stack.
409 * Since there is no trampoline in place we will treat !RIPV
410 * as terminal for any context.
411 */
412 ctx_xen = SEG_PL(regs->cs) == 0;
413 ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
414 ctx_domU = !ctx_xen && !ctx_dom0;
416 xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
417 !ripv;
418 dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
419 domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
421 if (xen_state_lost) {
422 /* Now we are going to panic anyway. Allow interrupts, so that
423 * printk on serial console can work. */
424 vcpu_schedule_unlock_irq(v);
425 irqlocked = 0;
427 printk("Terminal machine check exception occurred in "
428 "hypervisor context.\n");
430 /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
431 * to the error, then it makes sense to print a stack trace.
432 * That can be useful for more detailed error analysis and/or
433 * error case studies to figure out whether we could clear
434 * xen_impacted and kill a DomU instead
435 * (i.e. if only a guest control structure is affected, but then
436 * we must ensure the bad pages are not re-used).
437 */
438 if (bs.eipv) { /* eipv is already a boolean flag */
439 printk("MCE: Instruction Pointer is related to the "
440 "error, therefore print the execution state.\n");
441 show_execution_state(regs);
442 }
444 /* Commit the telemetry so that panic flow can find it. */
445 if (mctc != NULL) {
446 x86_mcinfo_dump(mci);
447 mctelem_commit(mctc);
448 }
449 mc_panic("Hypervisor state lost due to machine check "
450 "exception.\n");
451 /*NOTREACHED*/
452 }
454 /*
455 * Xen hypervisor state is intact. If dom0 state is lost then
456 * give it a chance to decide what to do if it has registered
457 * a handler for this event, otherwise panic.
458 *
459 * XXFM Could add some Solaris dom0 contract kill here?
460 */
461 if (dom0_state_lost) {
462 if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
463 dom_state = DOM0_TRAP;
464 send_guest_trap(dom0, 0, TRAP_machine_check);
465 /* XXFM case of return with !ripv ??? */
466 } else {
467 /* Commit telemetry for panic flow. */
468 if (mctc != NULL) {
469 x86_mcinfo_dump(mci);
470 mctelem_commit(mctc);
471 }
472 mc_panic("Dom0 state lost due to machine check "
473 "exception\n");
474 /*NOTREACHED*/
475 }
476 }
478 /*
479 * If a domU has lost state then send it a trap if it has registered
480 * a handler, otherwise crash the domain.
481 * XXFM Revisit this functionality.
482 */
483 if (domU_state_lost) {
484 if (guest_has_trap_callback(v->domain, v->vcpu_id,
485 TRAP_machine_check)) {
486 dom_state = DOMU_TRAP;
487 send_guest_trap(curdom, v->vcpu_id,
488 TRAP_machine_check);
489 } else {
490 dom_state = DOMU_KILLED;
491 /* Enable interrupts. This basically results in
492 * calling sti on the *physical* cpu. But after
493 * domain_crash() the vcpu pointer is invalid.
494 * Therefore, we must unlock the irqs before killing
495 * it. */
496 vcpu_schedule_unlock_irq(v);
497 irqlocked = 0;
499 /* DomU is impacted. Kill it and continue. */
500 domain_crash(curdom);
501 }
502 }
504 switch (dom_state) {
505 case DOM0_TRAP:
506 case DOMU_TRAP:
507 /* Enable interrupts. */
508 vcpu_schedule_unlock_irq(v);
509 irqlocked = 0;
511 /* guest softirqs and event callbacks are scheduled
512 * immediately after this handler exits. */
513 break;
514 case DOMU_KILLED:
515 /* Nothing to do here. */
516 break;
518 case DOM_NORMAL:
519 vcpu_schedule_unlock_irq(v);
520 irqlocked = 0;
521 break;
522 }
524 cmn_handler_done:
525 BUG_ON(irqlocked);
526 BUG_ON(!ripv);
528 if (bs.errcnt) {
529 /* Not panicking, so forward telemetry to dom0 now if it
530 * is interested. */
531 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
532 if (mctc != NULL)
533 mctelem_commit(mctc);
534 send_guest_global_virq(dom0, VIRQ_MCA);
535 } else {
536 x86_mcinfo_dump(mci);
537 if (mctc != NULL)
538 mctelem_dismiss(mctc);
539 }
540 } else if (mctc != NULL) {
541 mctelem_dismiss(mctc);
542 }
543 }
545 void mcheck_mca_clearbanks(cpu_banks_t bankmask)
546 {
547 int i;
548 uint64_t status;
550 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
551 if (!test_bit(i, bankmask))
552 continue;
553 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
554 if (!(status & MCi_STATUS_VAL))
555 continue;
556 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
557 }
558 }
560 static int amd_mcheck_init(struct cpuinfo_x86 *ci)
561 {
562 int rc = 0;
564 switch (ci->x86) {
565 case 6:
566 rc = amd_k7_mcheck_init(ci);
567 break;
569 default:
570 /* Assume that machine check support is available; unrecognized
571 * families are given at least K8-level initialization. */
572 case 0xf:
573 rc = amd_k8_mcheck_init(ci);
574 break;
576 case 0x10:
577 case 0x11:
578 rc = amd_f10_mcheck_init(ci);
579 break;
580 }
582 return rc;
583 }
585 /* Check for the existence of Machine Check support */
586 int mce_available(struct cpuinfo_x86 *c)
587 {
588 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
589 }
591 /*
592 * Check if bank 0 is usable for MCE. It isn't for AMD K7,
593 * and Intel P6 family before model 0x1a.
594 */
595 int mce_firstbank(struct cpuinfo_x86 *c)
596 {
597 if (c->x86 == 6) {
598 if (c->x86_vendor == X86_VENDOR_AMD)
599 return 1;
601 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
602 return 1;
603 }
605 return 0;
606 }
608 /* This has to be run for each processor */
609 void mcheck_init(struct cpuinfo_x86 *c)
610 {
611 int inited = 0, i;
613 if (mce_disabled == 1) {
614 printk(XENLOG_INFO "MCE support disabled by bootparam\n");
615 return;
616 }
618 for (i = 0; i < MAX_NR_BANKS; i++)
619 set_bit(i,mca_allbanks);
621 /* Enforce at least MCE support in CPUID information. Individual
622 * families may also need to enforce a check for MCA support. */
623 if (!cpu_has(c, X86_FEATURE_MCE)) {
624 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
625 smp_processor_id());
626 return;
627 }
629 intpose_init();
630 mctelem_init(sizeof (struct mc_info));
632 switch (c->x86_vendor) {
633 case X86_VENDOR_AMD:
634 inited = amd_mcheck_init(c);
635 break;
637 case X86_VENDOR_INTEL:
638 switch (c->x86) {
639 case 6:
640 case 15:
641 inited = intel_mcheck_init(c);
642 break;
643 }
644 break;
646 default:
647 break;
648 }
650 if ( !h_mci_ctrl )
651 {
652 h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks);
653 if (!h_mci_ctrl)
654 {
655 dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n");
656 return;
657 }
658 /* Banks below firstbank are not tracked; mark them all-enabled. */
659 memset(h_mci_ctrl, 0xff, nr_mce_banks * sizeof(*h_mci_ctrl));
660 for (i = firstbank; i < nr_mce_banks; i++)
661 rdmsrl(MSR_IA32_MC0_CTL + 4*i, h_mci_ctrl[i]);
662 }
663 if (g_mcg_cap & MCG_CTL_P)
664 rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl);
665 set_poll_bankmask(c);
666 if (!inited)
667 printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
668 smp_processor_id());
669 }
671 u64 mce_cap_init(void)
672 {
673 u32 l, h;
674 u64 value;
676 rdmsr(MSR_IA32_MCG_CAP, l, h);
677 value = ((u64)h << 32) | l;
678 /* For Guest vMCE usage */
679 g_mcg_cap = value & ~MCG_CMCI_P;
681 if (l & MCG_CTL_P) /* Control register present ? */
682 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
684 nr_mce_banks = l & MCG_CAP_COUNT;
685 if ( nr_mce_banks > MAX_NR_BANKS )
686 {
687 printk(KERN_WARNING "MCE: number of MCE banks exceeds MAX_NR_BANKS; capping\n");
688 g_mcg_cap = (g_mcg_cap & ~MCG_CAP_COUNT) | MAX_NR_BANKS;
689 }
691 return value;
692 }
694 /* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
695 void mce_init_msr(struct domain *d)
696 {
697 d->arch.vmca_msrs.mcg_status = 0x0;
698 d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
699 d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
700 d->arch.vmca_msrs.nr_injection = 0;
701 memset(d->arch.vmca_msrs.mci_ctl, ~0,
702 sizeof(d->arch.vmca_msrs.mci_ctl));
703 INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
704 spin_lock_init(&d->arch.vmca_msrs.lock);
705 }
707 int mce_rdmsr(uint32_t msr, uint64_t *val)
708 {
709 struct domain *d = current->domain;
710 int ret = 1;
711 unsigned int bank;
712 struct bank_entry *entry = NULL;
714 *val = 0;
715 spin_lock(&d->arch.vmca_msrs.lock);
717 switch ( msr )
718 {
719 case MSR_IA32_MCG_STATUS:
720 *val = d->arch.vmca_msrs.mcg_status;
721 if (*val)
722 mce_printk(MCE_VERBOSE,
723 "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
724 break;
725 case MSR_IA32_MCG_CAP:
726 *val = d->arch.vmca_msrs.mcg_cap;
727 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
728 *val);
729 break;
730 case MSR_IA32_MCG_CTL:
731 /* Always 0 if no CTL support */
732 *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
733 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
734 *val);
735 break;
736 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
737 bank = (msr - MSR_IA32_MC0_CTL) / 4;
738 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
739 {
740 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
741 ret = 0;
742 break;
743 }
744 switch (msr & (MSR_IA32_MC0_CTL | 3))
745 {
746 case MSR_IA32_MC0_CTL:
747 *val = d->arch.vmca_msrs.mci_ctl[bank] &
748 (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
749 mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
750 bank, *val);
751 break;
752 case MSR_IA32_MC0_STATUS:
753 /* Only the error bank is read; non-error banks simply return 0. */
754 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
755 {
756 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
757 struct bank_entry, list);
758 if (entry->bank == bank) {
759 *val = entry->mci_status;
760 mce_printk(MCE_VERBOSE,
761 "MCE: rd MC%u_STATUS in vMCE# context "
762 "value 0x%"PRIx64"\n", bank, *val);
763 }
764 else
765 entry = NULL;
766 }
767 break;
768 case MSR_IA32_MC0_ADDR:
769 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
770 {
771 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
772 struct bank_entry, list);
773 if ( entry->bank == bank )
774 {
775 *val = entry->mci_addr;
776 mce_printk(MCE_VERBOSE,
777 "MCE: rdmsr MC%u_ADDR in vMCE# context "
778 "0x%"PRIx64"\n", bank, *val);
779 }
780 }
781 break;
782 case MSR_IA32_MC0_MISC:
783 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
784 {
785 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
786 struct bank_entry, list);
787 if ( entry->bank == bank )
788 {
789 *val = entry->mci_misc;
790 mce_printk(MCE_VERBOSE,
791 "MCE: rd MC%u_MISC in vMCE# context "
792 "0x%"PRIx64"\n", bank, *val);
793 }
794 }
795 break;
796 }
797 break;
798 default:
799 switch ( boot_cpu_data.x86_vendor )
800 {
801 case X86_VENDOR_INTEL:
802 ret = intel_mce_rdmsr(msr, val);
803 break;
804 default:
805 ret = 0;
806 break;
807 }
808 break;
809 }
811 spin_unlock(&d->arch.vmca_msrs.lock);
812 return ret;
813 }
815 int mce_wrmsr(u32 msr, u64 val)
816 {
817 struct domain *d = current->domain;
818 struct bank_entry *entry = NULL;
819 unsigned int bank;
820 int ret = 1;
822 if ( !g_mcg_cap )
823 return 0;
825 spin_lock(&d->arch.vmca_msrs.lock);
827 switch ( msr )
828 {
829 case MSR_IA32_MCG_CTL:
830 d->arch.vmca_msrs.mcg_ctl = val;
831 break;
832 case MSR_IA32_MCG_STATUS:
833 d->arch.vmca_msrs.mcg_status = val;
834 mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
835 /* For an HVM guest, this is the point at which the vMCE injection node is deleted */
836 if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
837 {
838 d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
839 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
840 {
841 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
842 struct bank_entry, list);
843 if ( entry->mci_status & MCi_STATUS_VAL )
844 mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
845 "been cleared before write MCG_STATUS MSR\n");
847 mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
848 "Node, nr_injection %u\n",
849 d->arch.vmca_msrs.nr_injection);
850 list_del(&entry->list);
851 xfree(entry);
852 }
853 else
854 mce_printk(MCE_QUIET, "MCE: HVM guest's last injection"
855 " node not found, something is wrong!\n");
856 }
857 break;
858 case MSR_IA32_MCG_CAP:
859 mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
860 ret = -1;
861 break;
862 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
863 bank = (msr - MSR_IA32_MC0_CTL) / 4;
864 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
865 {
866 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
867 ret = 0;
868 break;
869 }
870 switch ( msr & (MSR_IA32_MC0_CTL | 3) )
871 {
872 case MSR_IA32_MC0_CTL:
873 d->arch.vmca_msrs.mci_ctl[bank] = val;
874 break;
875 case MSR_IA32_MC0_STATUS:
876 /* Take the first entry of the list; it corresponds to the current
877 * vMCE# injection. When the guest finishes processing the vMCE#,
878 * this node will be deleted.
879 * Only the error bank is written; writes to non-error banks are ignored.
880 */
881 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
882 {
883 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
884 struct bank_entry, list);
885 if ( entry->bank == bank )
886 entry->mci_status = val;
887 mce_printk(MCE_VERBOSE,
888 "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
889 bank, val);
890 }
891 else
892 mce_printk(MCE_VERBOSE,
893 "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
894 break;
895 case MSR_IA32_MC0_ADDR:
896 mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
897 ret = -1;
898 break;
899 case MSR_IA32_MC0_MISC:
900 mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
901 ret = -1;
902 break;
903 }
904 break;
905 default:
906 switch ( boot_cpu_data.x86_vendor )
907 {
908 case X86_VENDOR_INTEL:
909 ret = intel_mce_wrmsr(msr, val);
910 break;
911 default:
912 ret = 0;
913 break;
914 }
915 break;
916 }
918 spin_unlock(&d->arch.vmca_msrs.lock);
919 return ret;
920 }
922 static void mcinfo_clear(struct mc_info *mi)
923 {
924 memset(mi, 0, sizeof(struct mc_info));
925 x86_mcinfo_nentries(mi) = 0;
926 }
928 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
929 {
930 int i;
931 unsigned long end1, end2;
932 struct mcinfo_common *mic, *mic_base, *mic_index;
934 mic = (struct mcinfo_common *)mcinfo;
935 mic_index = mic_base = x86_mcinfo_first(mi);
937 /* go to first free entry */
938 for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
939 mic_index = x86_mcinfo_next(mic_index);
940 }
942 /* check if there is enough space */
943 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
944 end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
946 if (end1 < end2)
947 return x86_mcerr("mcinfo_add: no more space", -ENOSPC);
949 /* there's enough space. add entry. */
950 memcpy(mic_index, mic, mic->size);
951 x86_mcinfo_nentries(mi)++;
953 return 0;
954 }
956 /* Dump machine check information in a format that
957 * mcelog can parse. This is used only when
958 * Dom0 does not take the notification. */
959 void x86_mcinfo_dump(struct mc_info *mi)
960 {
961 struct mcinfo_common *mic = NULL;
962 struct mcinfo_global *mc_global;
963 struct mcinfo_bank *mc_bank;
965 /* first print the global info */
966 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
967 if (mic == NULL)
968 return;
969 mc_global = (struct mcinfo_global *)mic;
970 if (mc_global->mc_flags & MC_FLAG_MCE) {
971 printk(XENLOG_WARNING
972 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
973 mc_global->mc_coreid, mc_global->mc_gstatus);
974 } else {
975 printk(XENLOG_WARNING "MCE: The hardware reports that a non-"
976 "fatal, correctable incident occurred on "
977 "CPU %d.\n",
978 mc_global->mc_coreid);
979 }
981 /* then the bank information */
982 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
983 do {
984 if (mic == NULL)
985 return;
986 if (mic->type != MC_TYPE_BANK)
987 goto next;
989 mc_bank = (struct mcinfo_bank *)mic;
991 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
992 mc_bank->mc_bank,
993 mc_bank->mc_status);
994 if (mc_bank->mc_status & MCi_STATUS_MISCV)
995 printk("[%16"PRIx64"]", mc_bank->mc_misc);
996 if (mc_bank->mc_status & MCi_STATUS_ADDRV)
997 printk(" at %16"PRIx64, mc_bank->mc_addr);
999 printk("\n");
1000 next:
1001 mic = x86_mcinfo_next(mic); /* next entry */
1002 if ((mic == NULL) || (mic->size == 0))
1003 break;
1004 } while (1);
1007 static void do_mc_get_cpu_info(void *v)
1009 int cpu = smp_processor_id();
1010 int cindex, cpn;
1011 struct cpuinfo_x86 *c;
1012 xen_mc_logical_cpu_t *log_cpus, *xcp;
1013 uint32_t junk, ebx;
1015 log_cpus = v;
1016 c = &cpu_data[cpu];
1017 cindex = 0;
1018 cpn = cpu - 1;
1020 /*
1021 * Deal with sparse masks, condensed into a contig array.
1022 */
1023 while (cpn >= 0) {
1024 if (cpu_isset(cpn, cpu_online_map))
1025 cindex++;
1026 cpn--;
1029 xcp = &log_cpus[cindex];
1030 c = &cpu_data[cpu];
1031 xcp->mc_cpunr = cpu;
1032 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
1033 &xcp->mc_coreid, &xcp->mc_threadid,
1034 &xcp->mc_apicid, &xcp->mc_ncores,
1035 &xcp->mc_ncores_active, &xcp->mc_nthreads);
1036 xcp->mc_cpuid_level = c->cpuid_level;
1037 xcp->mc_family = c->x86;
1038 xcp->mc_vendor = c->x86_vendor;
1039 xcp->mc_model = c->x86_model;
1040 xcp->mc_step = c->x86_mask;
1041 xcp->mc_cache_size = c->x86_cache_size;
1042 xcp->mc_cache_alignment = c->x86_cache_alignment;
1043 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1044 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1045 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1047 /*
1048 * This part needs to run on the CPU itself.
1049 */
1050 xcp->mc_nmsrvals = __MC_NMSRS;
1051 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1052 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1054 if (c->cpuid_level >= 1) {
1055 cpuid(1, &junk, &ebx, &junk, &junk);
1056 xcp->mc_clusterid = (ebx >> 24) & 0xff;
1057 } else
1058 xcp->mc_clusterid = hard_smp_processor_id();
1062 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
1063 uint16_t *threadid, uint32_t *apicid,
1064 unsigned *ncores, unsigned *ncores_active,
1065 unsigned *nthreads)
1067 struct cpuinfo_x86 *c;
1069 *apicid = cpu_physical_id(cpu);
1070 c = &cpu_data[cpu];
1071 if (c->apicid == BAD_APICID) {
1072 *chipid = cpu;
1073 *coreid = 0;
1074 *threadid = 0;
1075 if (ncores != NULL)
1076 *ncores = 1;
1077 if (ncores_active != NULL)
1078 *ncores_active = 1;
1079 if (nthreads != NULL)
1080 *nthreads = 1;
1081 } else {
1082 *chipid = phys_proc_id[cpu];
1083 if (c->x86_max_cores > 1)
1084 *coreid = cpu_core_id[cpu];
1085 else
1086 *coreid = 0;
1087 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1088 if (ncores != NULL)
1089 *ncores = c->x86_max_cores;
1090 if (ncores_active != NULL)
1091 *ncores_active = c->booted_cores;
1092 if (nthreads != NULL)
1093 *nthreads = c->x86_num_siblings;
1097 #define INTPOSE_NENT 50
1099 static struct intpose_ent {
1100 unsigned int cpu_nr;
1101 uint64_t msr;
1102 uint64_t val;
1103 } intpose_arr[INTPOSE_NENT];
1105 static void intpose_init(void)
1107 static int done;
1108 int i;
1110 if (done++ > 0)
1111 return;
1113 for (i = 0; i < INTPOSE_NENT; i++) {
1114 intpose_arr[i].cpu_nr = -1;
1119 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1120 uint64_t *valp)
1122 int i;
1124 for (i = 0; i < INTPOSE_NENT; i++) {
1125 if (intpose_arr[i].cpu_nr == cpu_nr &&
1126 intpose_arr[i].msr == msr) {
1127 if (valp != NULL)
1128 *valp = intpose_arr[i].val;
1129 return &intpose_arr[i];
1133 return NULL;
1136 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1138 struct intpose_ent *ent;
1139 int i;
1141 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1142 ent->val = val;
1143 return;
1146 for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
1147 if (ent->cpu_nr == -1) {
1148 ent->cpu_nr = cpu_nr;
1149 ent->msr = msr;
1150 ent->val = val;
1151 return;
1155 printk("intpose_add: interpose array full - request dropped\n");
1158 void intpose_inval(unsigned int cpu_nr, uint64_t msr)
1160 struct intpose_ent *ent;
1162 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1163 ent->cpu_nr = -1;
1167 #define IS_MCA_BANKREG(r) \
1168 ((r) >= MSR_IA32_MC0_CTL && \
1169 (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
1170 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
1172 int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d)
1174 int bank_nr;
1176 if ( !bank || !d || !h_mci_ctrl )
1177 return 1;
1179 /* Will an MCE happen on the host if the host mcg_ctl is 0? */
1180 if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
1181 return 1;
1183 bank_nr = bank->mc_bank;
1184 if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
1185 return 1;
1186 return 0;
1189 static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1191 struct cpuinfo_x86 *c;
1192 int i, errs = 0;
1194 c = &cpu_data[smp_processor_id()];
1196 for (i = 0; i < mci->mcinj_count; i++) {
1197 uint64_t reg = mci->mcinj_msr[i].reg;
1198 const char *reason = NULL;
1200 if (IS_MCA_BANKREG(reg)) {
1201 if (c->x86_vendor == X86_VENDOR_AMD) {
1202 /* On AMD we can set MCi_STATUS_WREN in the
1203 * HWCR MSR so that non-zero writes to bank
1204 * MSRs do not #GP. The injector in dom0
1205 * should set that bit, but we detect when it
1206 * is necessary and set it as a courtesy to
1207 * avoid #GP in the hypervisor. */
1208 mci->mcinj_flags |=
1209 _MC_MSRINJ_F_REQ_HWCR_WREN;
1210 continue;
1211 } else {
1212 /* No alternative but to interpose, so require
1213 * that the injector specified interposition. */
1214 if (!(mci->mcinj_flags &
1215 MC_MSRINJ_F_INTERPOSE)) {
1216 reason = "must specify interposition";
1219 } else {
1220 switch (reg) {
1221 /* MSRs acceptable on all x86 cpus */
1222 case MSR_IA32_MCG_STATUS:
1223 break;
1225 /* MSRs that the HV will take care of */
1226 case MSR_K8_HWCR:
1227 if (c->x86_vendor == X86_VENDOR_AMD)
1228 reason = "HV will operate HWCR";
1229 else
1230 reason ="only supported on AMD";
1231 break;
1233 default:
1234 reason = "not a recognized MCA MSR";
1235 break;
1239 if (reason != NULL) {
1240 printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
1241 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1242 errs++;
1246 return !errs;
1249 static uint64_t x86_mc_hwcr_wren(void)
1251 uint64_t old;
1253 rdmsrl(MSR_K8_HWCR, old);
1255 if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
1256 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1257 wrmsrl(MSR_K8_HWCR, new);
1260 return old;
1263 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1265 if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
1266 wrmsrl(MSR_K8_HWCR, hwcr);
1269 static void x86_mc_msrinject(void *data)
1271 struct xen_mc_msrinject *mci = data;
1272 struct mcinfo_msr *msr;
1273 struct cpuinfo_x86 *c;
1274 uint64_t hwcr = 0;
1275 int intpose;
1276 int i;
1278 c = &cpu_data[smp_processor_id()];
1280 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1281 hwcr = x86_mc_hwcr_wren();
1283 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1285 for (i = 0, msr = &mci->mcinj_msr[0];
1286 i < mci->mcinj_count; i++, msr++) {
1287 printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
1288 "<-- 0x%llx\n",
1289 intpose ? "interpose" : "hardware",
1290 mci->mcinj_cpunr, smp_processor_id(),
1291 (unsigned long long)msr->reg,
1292 (unsigned long long)msr->value);
1294 if (intpose)
1295 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1296 else
1297 wrmsrl(msr->reg, msr->value);
1300 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1301 x86_mc_hwcr_wren_restore(hwcr);
1304 /*ARGSUSED*/
1305 static void x86_mc_mceinject(void *data)
1307 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1308 __asm__ __volatile__("int $0x12");
1311 #if BITS_PER_LONG == 64
1313 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1314 #define COOKIE2ID(c) ((uint64_t)(c))
1316 #elif BITS_PER_LONG == 32
1318 #define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
1319 #define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
1321 #elif defined(BITS_PER_LONG)
1322 #error BITS_PER_LONG has unexpected value
1323 #else
1324 #error BITS_PER_LONG definition absent
1325 #endif
1327 #ifdef CONFIG_COMPAT
1328 # include <compat/arch-x86/xen-mca.h>
1330 # define xen_mcinfo_msr mcinfo_msr
1331 CHECK_mcinfo_msr;
1332 # undef xen_mcinfo_msr
1333 # undef CHECK_mcinfo_msr
1334 # define CHECK_mcinfo_msr struct mcinfo_msr
1336 # define xen_mcinfo_common mcinfo_common
1337 CHECK_mcinfo_common;
1338 # undef xen_mcinfo_common
1339 # undef CHECK_mcinfo_common
1340 # define CHECK_mcinfo_common struct mcinfo_common
1342 CHECK_FIELD_(struct, mc_fetch, flags);
1343 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1344 # define CHECK_compat_mc_fetch struct mc_fetch
1346 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1347 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1349 CHECK_mc;
1350 # undef CHECK_compat_mc_fetch
1351 # undef CHECK_compat_mc_physcpuinfo
1353 # define xen_mc_info mc_info
1354 CHECK_mc_info;
1355 # undef xen_mc_info
1357 # define xen_mcinfo_global mcinfo_global
1358 CHECK_mcinfo_global;
1359 # undef xen_mcinfo_global
1361 # define xen_mcinfo_bank mcinfo_bank
1362 CHECK_mcinfo_bank;
1363 # undef xen_mcinfo_bank
1365 # define xen_mcinfo_extended mcinfo_extended
1366 CHECK_mcinfo_extended;
1367 # undef xen_mcinfo_extended
1369 # define xen_mcinfo_recovery mcinfo_recovery
1370 # define xen_cpu_offline_action cpu_offline_action
1371 # define xen_page_offline_action page_offline_action
1372 CHECK_mcinfo_recovery;
1373 # undef xen_cpu_offline_action
1374 # undef xen_page_offline_action
1375 # undef xen_mcinfo_recovery
1376 #else
1377 # define compat_mc_fetch xen_mc_fetch
1378 # define compat_mc_physcpuinfo xen_mc_physcpuinfo
1379 # define compat_handle_is_null guest_handle_is_null
1380 # define copy_to_compat copy_to_guest
1381 #endif
1383 /* Machine Check Architecture Hypercall */
1384 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
1386 long ret = 0;
1387 struct xen_mc curop, *op = &curop;
1388 struct vcpu *v = current;
1389 union {
1390 struct xen_mc_fetch *nat;
1391 struct compat_mc_fetch *cmp;
1392 } mc_fetch;
1393 union {
1394 struct xen_mc_physcpuinfo *nat;
1395 struct compat_mc_physcpuinfo *cmp;
1396 } mc_physcpuinfo;
1397 uint32_t flags, cmdflags;
1398 int nlcpu;
1399 xen_mc_logical_cpu_t *log_cpus = NULL;
1400 mctelem_cookie_t mctc;
1401 mctelem_class_t which;
1402 unsigned int target;
1403 struct xen_mc_msrinject *mc_msrinject;
1404 struct xen_mc_mceinject *mc_mceinject;
1406 if (!IS_PRIV(v->domain) )
1407 return x86_mcerr(NULL, -EPERM);
1409 if ( copy_from_guest(op, u_xen_mc, 1) )
1410 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1412 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1413 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1415 switch (op->cmd) {
1416 case XEN_MC_fetch:
1417 mc_fetch.nat = &op->u.mc_fetch;
1418 cmdflags = mc_fetch.nat->flags;
1420 switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
1421 case XEN_MC_NONURGENT:
1422 which = MC_NONURGENT;
1423 break;
1425 case XEN_MC_URGENT:
1426 which = MC_URGENT;
1427 break;
1429 default:
1430 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1433 flags = XEN_MC_OK;
1435 if (cmdflags & XEN_MC_ACK) {
1436 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1437 mctelem_ack(which, cookie);
1438 } else {
1439 if (!is_pv_32on64_vcpu(v)
1440 ? guest_handle_is_null(mc_fetch.nat->data)
1441 : compat_handle_is_null(mc_fetch.cmp->data))
1442 return x86_mcerr("do_mca fetch: guest buffer "
1443 "invalid", -EINVAL);
1445 if ((mctc = mctelem_consume_oldest_begin(which))) {
1446 struct mc_info *mcip = mctelem_dataptr(mctc);
1447 if (!is_pv_32on64_vcpu(v)
1448 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1449 : copy_to_compat(mc_fetch.cmp->data,
1450 mcip, 1)) {
1451 ret = -EFAULT;
1452 flags |= XEN_MC_FETCHFAILED;
1453 mc_fetch.nat->fetch_id = 0;
1454 } else {
1455 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1457 mctelem_consume_oldest_end(mctc);
1458 } else {
1459 /* There is no data */
1460 flags |= XEN_MC_NODATA;
1461 mc_fetch.nat->fetch_id = 0;
1464 mc_fetch.nat->flags = flags;
1465 if (copy_to_guest(u_xen_mc, op, 1) != 0)
1466 ret = -EFAULT;
1469 break;
1471 case XEN_MC_notifydomain:
1472 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1474 case XEN_MC_physcpuinfo:
1475 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1476 nlcpu = num_online_cpus();
1478 if (!is_pv_32on64_vcpu(v)
1479 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1480 : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
1481 if (mc_physcpuinfo.nat->ncpus <= 0)
1482 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1483 -EINVAL);
1484 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1485 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1486 if (log_cpus == NULL)
1487 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1489 if (on_each_cpu(do_mc_get_cpu_info, log_cpus, 1)) {
1490 xfree(log_cpus);
1491 return x86_mcerr("do_mca cpuinfo", -EIO);
1493 if (!is_pv_32on64_vcpu(v)
1494 ? copy_to_guest(mc_physcpuinfo.nat->info,
1495 log_cpus, nlcpu)
1496 : copy_to_compat(mc_physcpuinfo.cmp->info,
1497 log_cpus, nlcpu))
1498 ret = -EFAULT;
1499 xfree(log_cpus);
1502 mc_physcpuinfo.nat->ncpus = nlcpu;
1504 if (copy_to_guest(u_xen_mc, op, 1))
1505 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1507 break;
1509 case XEN_MC_msrinject:
1510 if (nr_mce_banks == 0)
1511 return x86_mcerr("do_mca inject", -ENODEV);
1513 mc_msrinject = &op->u.mc_msrinject;
1514 target = mc_msrinject->mcinj_cpunr;
1516 if (target >= NR_CPUS)
1517 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1519 if (!cpu_isset(target, cpu_online_map))
1520 return x86_mcerr("do_mca inject: target offline",
1521 -EINVAL);
1523 if (mc_msrinject->mcinj_count == 0)
1524 return 0;
1526 if (!x86_mc_msrinject_verify(mc_msrinject))
1527 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1529 add_taint(TAINT_ERROR_INJECT);
1531 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1532 mc_msrinject, 1);
1534 break;
1536 case XEN_MC_mceinject:
1537 if (nr_mce_banks == 0)
1538 return x86_mcerr("do_mca #MC", -ENODEV);
1540 mc_mceinject = &op->u.mc_mceinject;
1541 target = mc_mceinject->mceinj_cpunr;
1543 if (target >= NR_CPUS)
1544 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1546 if (!cpu_isset(target, cpu_online_map))
1547 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1549 add_taint(TAINT_ERROR_INJECT);
1551 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1552 mc_mceinject, 1);
1553 break;
1555 default:
1556 return x86_mcerr("do_mca: bad command", -EINVAL);
1559 return ret;
1561 void set_poll_bankmask(struct cpuinfo_x86 *c)
1564 if (cmci_support && !mce_disabled) {
1565 memcpy(&(__get_cpu_var(poll_bankmask)),
1566 &(__get_cpu_var(no_cmci_banks)), sizeof(cpu_banks_t));
1568 else {
1569 memcpy(&(get_cpu_var(poll_bankmask)), &mca_allbanks, sizeof(cpu_banks_t));
1570 if (mce_firstbank(c))
1571 clear_bit(0, get_cpu_var(poll_bankmask));
1574 void mc_panic(char *s)
1576 is_mc_panic = 1;
1577 console_start_sync();
1578 printk("Fatal machine check: %s\n", s);
1579 printk("\n"
1580 "****************************************\n"
1581 "\n"
1582 " The processor has reported a hardware error which cannot\n"
1583 " be recovered from. Xen will now reboot the machine.\n");
1584 panic("HARDWARE ERROR");