
view xen/arch/x86/cpu/mcheck/mce.c @ 20963:da7ae6d8838a

x86: MCE fixes

- fill_vmsr_data() leaked a domain reference; since the caller already
obtained one, there's no need to obtain another one here (see the sketch below)
- intel_UCR_handler() could call put_domain() with a NULL pointer
- mcheck_mca_logout() updated a local data structure that wasn't used
after the update
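
A minimal, self-contained sketch of the reference-counting pattern behind the first two fixes above. This is not the Xen code itself: fill_vmsr_data() and intel_UCR_handler() live in the Intel-specific MCE sources, and the names below (fill_telemetry, handle_ucr_error, dom5) are illustrative stand-ins, as are the simplified get_domain_by_id()/put_domain() helpers. The point is that the caller owns the domain reference, the callee must not take a second one, and put_domain() must never be handed a NULL pointer.

#include <stdio.h>

struct domain {
    int domain_id;
    int refcnt;
};

/* One known domain, already holding its own base reference. */
static struct domain dom5 = { .domain_id = 5, .refcnt = 1 };

/* Simplified stand-in for Xen's get_domain_by_id(): returns NULL for an
 * unknown domain, otherwise takes an extra reference. */
static struct domain *get_domain_by_id(int domid)
{
    if (domid != dom5.domain_id)
        return NULL;            /* unknown domain: no reference taken */
    dom5.refcnt++;
    return &dom5;
}

/* Simplified stand-in for put_domain(): would misbehave if given NULL. */
static void put_domain(struct domain *d)
{
    d->refcnt--;
}

/* Callee uses the reference its caller already holds. Calling
 * get_domain_by_id() again here and never dropping that reference
 * would be the leak described in the first bullet. */
static void fill_telemetry(const struct domain *d)
{
    printf("logging error against domain %d\n", d->domain_id);
}

static void handle_ucr_error(int impacted_domid)
{
    struct domain *d = get_domain_by_id(impacted_domid);

    if (d == NULL)
        return;                 /* guard: never call put_domain(NULL) */

    fill_telemetry(d);
    put_domain(d);              /* drop exactly the reference taken above */
}

int main(void)
{
    handle_ucr_error(5);        /* known domain: reference taken and dropped */
    handle_ucr_error(7);        /* unknown domain: nothing to put */
    printf("refcnt back to %d\n", dom5.refcnt);
    return 0;
}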

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 10 09:18:11 2010 +0000 (2010-02-10)
parents ebd2495ec073
children 50ea24db1f88
line source
1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/config.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/sched-if.h>
15 #include <xen/cpumask.h>
16 #include <xen/event.h>
17 #include <xen/guest_access.h>
18 #include <xen/hypercall.h> /* for do_mca */
20 #include <asm/processor.h>
21 #include <asm/system.h>
22 #include <asm/msr.h>
24 #include "mce.h"
26 int mce_disabled;
27 invbool_param("mce", mce_disabled);
28 static int mce_force_broadcast;
29 boolean_param("mce_fb", mce_force_broadcast);
30 int is_mc_panic;
31 unsigned int nr_mce_banks;
33 int mce_broadcast = 0;
34 static uint64_t g_mcg_cap;
36 /* Real value in physical CTL MSR */
37 static uint64_t h_mcg_ctl = 0UL;
38 static uint64_t *h_mci_ctrl;
39 int firstbank;
41 static void intpose_init(void);
42 static void mcinfo_clear(struct mc_info *);
44 #define SEG_PL(segsel) ((segsel) & 0x3)
45 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
47 #if 0
48 static int x86_mcerr(const char *msg, int err)
49 {
50 gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n",
51 msg != NULL ? msg : "", err);
52 return err;
53 }
54 #else
55 #define x86_mcerr(msg, err) (err)
56 #endif
58 cpu_banks_t mca_allbanks;
60 int mce_verbosity;
61 static void __init mce_set_verbosity(char *str)
62 {
63 if (strcmp("verbose", str) == 0)
64 mce_verbosity = MCE_VERBOSE;
65 else
66 printk(KERN_DEBUG "Machine Check verbosity level %s not recognised, "
67 "use mce_verbosity=verbose\n", str);
68 }
69 custom_param("mce_verbosity", mce_set_verbosity);
71 /* Handle unconfigured int18 (should never happen) */
72 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
73 {
74 printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
75 smp_processor_id());
76 }
79 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
81 void x86_mce_vector_register(x86_mce_vector_t hdlr)
82 {
83 _machine_check_vector = hdlr;
84 wmb();
85 }
87 /* Call the installed machine check handler for this CPU setup. */
89 void machine_check_vector(struct cpu_user_regs *regs, long error_code)
90 {
91 _machine_check_vector(regs, error_code);
92 }
94 /* Init machine check callback handler
95 * It is used to collect additional information provided by newer
96 * CPU families/models without the need to duplicate the whole handler.
97 * This avoids having many handlers doing nearly the same thing, each
98 * with its own tweaks and bugs. */
99 static x86_mce_callback_t mc_callback_bank_extended = NULL;
101 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
102 {
103 mc_callback_bank_extended = cbfunc;
104 }
106 /* Machine check recoverability judgement callback handler.
107 * It is used to judge whether a UC error is recoverable by software.
108 */
109 static mce_recoverable_t mc_recoverable_scan = NULL;
111 void mce_recoverable_register(mce_recoverable_t cbfunc)
112 {
113 mc_recoverable_scan = cbfunc;
114 }
116 /* Callback handler for judging whether a Machine Check error bank should
117 * be cleared. According to Intel's latest MCA OS Recovery Writer's Guide,
118 * whether the error MCA bank needs to be cleared is decided by the mca_source
119 * and the MCi_STATUS bit values.
120 */
121 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
123 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
124 {
125 mc_need_clearbank_scan = cbfunc;
126 }
128 /* Utility function to perform MCA bank telemetry readout and to push that
129 * telemetry towards an interested dom0 for logging and diagnosis.
130 * The caller - #MC handler or MCA poll function - must arrange that we
131 * do not migrate cpus. */
133 /* XXFM Could add overflow counting? */
135 /* Add the out-param clear_bank for the machine check handler's caller.
136 * For the latest Intel CPUs, whether to clear the error bank status needs to
137 * be judged by the callback function defined above.
138 */
139 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
140 struct mca_summary *sp, cpu_banks_t* clear_bank)
141 {
142 struct vcpu *v = current;
143 struct domain *d;
144 uint64_t gstatus, status, addr, misc;
145 struct mcinfo_global mcg; /* on stack */
146 struct mcinfo_common *mic;
147 struct mcinfo_global *mig; /* on stack */
148 mctelem_cookie_t mctc = NULL;
149 uint32_t uc = 0, pcc = 0, recover, need_clear = 1 ;
150 struct mc_info *mci = NULL;
151 mctelem_class_t which = MC_URGENT; /* XXXgcc */
152 unsigned int cpu_nr;
153 int errcnt = 0;
154 int i;
155 enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
157 cpu_nr = smp_processor_id();
158 BUG_ON(cpu_nr != v->processor);
160 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
162 memset(&mcg, 0, sizeof (mcg));
163 mcg.common.type = MC_TYPE_GLOBAL;
164 mcg.common.size = sizeof (mcg);
165 if (v != NULL && ((d = v->domain) != NULL)) {
166 mcg.mc_domid = d->domain_id;
167 mcg.mc_vcpuid = v->vcpu_id;
168 } else {
169 mcg.mc_domid = -1;
170 mcg.mc_vcpuid = -1;
171 }
172 mcg.mc_gstatus = gstatus; /* MCG_STATUS */
174 switch (who) {
175 case MCA_MCE_HANDLER:
176 case MCA_MCE_SCAN:
177 mcg.mc_flags = MC_FLAG_MCE;
178 which = MC_URGENT;
179 break;
181 case MCA_POLLER:
182 case MCA_RESET:
183 mcg.mc_flags = MC_FLAG_POLLED;
184 which = MC_NONURGENT;
185 break;
187 case MCA_CMCI_HANDLER:
188 mcg.mc_flags = MC_FLAG_CMCI;
189 which = MC_NONURGENT;
190 break;
192 default:
193 BUG();
194 }
196 /* Retrieve detector information */
197 x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
198 &mcg.mc_coreid, &mcg.mc_core_threadid,
199 &mcg.mc_apicid, NULL, NULL, NULL);
201 /* If no mc_recoverable_scan callback handler is registered,
202 * this error is not recoverable.
203 */
204 recover = (mc_recoverable_scan)? 1: 0;
206 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
207 struct mcinfo_bank mcb; /* on stack */
209 /* Skip bank if corresponding bit in bankmask is clear */
210 if (!test_bit(i, bankmask))
211 continue;
213 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
214 if (!(status & MCi_STATUS_VAL))
215 continue; /* this bank has no valid telemetry */
217 /* For the latest Intel CPUs, the CMCI/MCE handler caller needs to
218 * decide whether to clear the bank based on MCi_STATUS bits such as
219 * OVER/UC/EN/PCC/S/AR.
220 */
221 if ( mc_need_clearbank_scan )
222 need_clear = mc_need_clearbank_scan(who, status);
224 /* If this is the first bank with valid MCA DATA, then
225 * try to reserve an entry from the urgent/nonurgent queue
226 * depending on whether we are called from an exception or
227 * a poller; this can fail (for example dom0 may not
228 * yet have consumed past telemetry). */
229 if (errcnt == 0) {
230 if ((mctc = mctelem_reserve(which)) != NULL) {
231 mci = mctelem_dataptr(mctc);
232 mcinfo_clear(mci);
233 }
234 }
236 memset(&mcb, 0, sizeof (mcb));
237 mcb.common.type = MC_TYPE_BANK;
238 mcb.common.size = sizeof (mcb);
239 mcb.mc_bank = i;
240 mcb.mc_status = status;
242 /* form a mask of which banks have logged uncorrected errors */
243 if ((status & MCi_STATUS_UC) != 0)
244 uc |= (1 << i);
246 /* likewise for those with processor context corrupt */
247 if ((status & MCi_STATUS_PCC) != 0)
248 pcc |= (1 << i);
250 if (recover && uc)
251 /* uc = 1 and recover = 1: we may not need to panic.
252 */
253 recover = mc_recoverable_scan(status);
255 addr = misc = 0;
257 if (status & MCi_STATUS_ADDRV) {
258 mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
259 if (mfn_valid(paddr_to_pfn(addr))) {
260 d = maddr_get_owner(addr);
261 if (d != NULL && (who == MCA_POLLER ||
262 who == MCA_CMCI_HANDLER))
263 mcb.mc_domid = d->domain_id;
264 }
265 }
267 if (status & MCi_STATUS_MISCV)
268 mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
270 mcb.mc_addr = addr;
271 mcb.mc_misc = misc;
273 if (who == MCA_CMCI_HANDLER) {
274 mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
275 rdtscll(mcb.mc_tsc);
276 }
278 /* Increment the error count; if this is the first bank
279 * with a valid error then add the global info to the mcinfo. */
280 if (errcnt++ == 0 && mci != NULL)
281 x86_mcinfo_add(mci, &mcg);
283 /* Add the bank data */
284 if (mci != NULL)
285 x86_mcinfo_add(mci, &mcb);
287 if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
288 cbret = mc_callback_bank_extended(mci, i, status);
289 }
291 /* By default, need_clear = 1 */
292 if (who != MCA_MCE_SCAN && need_clear)
293 /* Clear status */
294 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
295 else if ( who == MCA_MCE_SCAN && need_clear)
296 set_bit(i, clear_bank);
298 wmb();
299 }
301 if (mci != NULL && errcnt > 0) {
302 x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
303 mig = container_of(mic, struct mcinfo_global, common);
304 if (mic == NULL)
305 ;
306 else if (pcc)
307 mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
308 else if (uc)
309 mig->mc_flags |= MC_FLAG_RECOVERABLE;
310 else
311 mig->mc_flags |= MC_FLAG_CORRECTABLE;
312 }
315 if (sp) {
316 sp->errcnt = errcnt;
317 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
318 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
319 sp->uc = uc;
320 sp->pcc = pcc;
321 sp->recoverable = recover;
322 }
324 return mci != NULL ? mctc : NULL; /* may be NULL */
325 }
327 #define DOM_NORMAL 0
328 #define DOM0_TRAP 1
329 #define DOMU_TRAP 2
330 #define DOMU_KILLED 4
332 /* Shared #MC handler. */
333 void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
334 cpu_banks_t bankmask)
335 {
336 int xen_state_lost, dom0_state_lost, domU_state_lost;
337 struct vcpu *v = current;
338 struct domain *curdom = v->domain;
339 domid_t domid = curdom->domain_id;
340 int ctx_xen, ctx_dom0, ctx_domU;
341 uint32_t dom_state = DOM_NORMAL;
342 mctelem_cookie_t mctc = NULL;
343 struct mca_summary bs;
344 struct mc_info *mci = NULL;
345 int irqlocked = 0;
346 uint64_t gstatus;
347 int ripv;
349 /* This handler runs as an interrupt gate, so IPIs from the
350 * polling service routine are deferred until we're finished.
351 */
353 /* Disable interrupts for the _vcpu_. It must not be re-scheduled to
354 * another physical CPU. */
355 vcpu_schedule_lock_irq(v);
356 irqlocked = 1;
358 /* Read global status; if it does not indicate machine check
359 * in progress then bail as long as we have a valid ip to return to. */
360 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
361 ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
362 if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
363 add_taint(TAINT_MACHINE_CHECK); /* questionable */
364 vcpu_schedule_unlock_irq(v);
365 irqlocked = 0;
366 goto cmn_handler_done;
367 }
369 /* Go and grab error telemetry. We must choose whether to commit
370 * for logging or dismiss the cookie that is returned, and must not
371 * reference the cookie after that action.
372 */
373 mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
374 if (mctc != NULL)
375 mci = (struct mc_info *)mctelem_dataptr(mctc);
377 /* Clear MCIP or another #MC will enter shutdown state */
378 gstatus &= ~MCG_STATUS_MCIP;
379 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
380 wmb();
382 /* If no valid errors and our stack is intact, we're done */
383 if (ripv && bs.errcnt == 0) {
384 vcpu_schedule_unlock_irq(v);
385 irqlocked = 0;
386 goto cmn_handler_done;
387 }
389 if (bs.uc || bs.pcc)
390 add_taint(TAINT_MACHINE_CHECK);
392 /* Machine check exceptions will usually be for UC and/or PCC errors,
393 * but it is possible to configure machine check for some classes
394 * of corrected error.
395 *
396 * UC errors could compromise any domain or the hypervisor
397 * itself - for example a cache writeback of modified data that
398 * turned out to be bad could be for data belonging to anyone, not
399 * just the current domain. In the absence of known data poisoning
400 * to prevent consumption of such bad data in the system we regard
401 * all UC errors as terminal. It may be possible to attempt some
402 * heuristics based on the address affected, which guests have
403 * mappings to that mfn etc.
404 *
405 * PCC errors apply to the current context.
406 *
407 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
408 * and not PCC is terminal - the return instruction pointer
409 * pushed onto the stack is bogus. If the interrupt context is
410 * the hypervisor or dom0 the game is over, otherwise we can
411 * limit the impact to a single domU but only if we trampoline
412 * somewhere safely - we can't return and unwind the stack.
413 * Since there is no trampoline in place we will treat !RIPV
414 * as terminal for any context.
415 */
416 ctx_xen = SEG_PL(regs->cs) == 0;
417 ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
418 ctx_domU = !ctx_xen && !ctx_dom0;
420 xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
421 !ripv;
422 dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
423 domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
425 if (xen_state_lost) {
426 /* Now we are going to panic anyway. Allow interrupts, so that
427 * printk on serial console can work. */
428 vcpu_schedule_unlock_irq(v);
429 irqlocked = 0;
431 printk("Terminal machine check exception occurred in "
432 "hypervisor context.\n");
434 /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
435 * to the error, then it makes sense to print a stack trace.
436 * That can be useful for more detailed error analysis and/or
437 * error case studies to figure out whether we can clear
438 * xen_impacted and kill a DomU instead
439 * (i.e. if only a guest control structure is affected, but then
440 * we must ensure the bad pages are not re-used).
441 */
442 if (bs.eipv) {
443 printk("MCE: Instruction Pointer is related to the "
444 "error, therefore print the execution state.\n");
445 show_execution_state(regs);
446 }
448 /* Commit the telemetry so that panic flow can find it. */
449 if (mctc != NULL) {
450 x86_mcinfo_dump(mci);
451 mctelem_commit(mctc);
452 }
453 mc_panic("Hypervisor state lost due to machine check "
454 "exception.\n");
455 /*NOTREACHED*/
456 }
458 /*
459 * Xen hypervisor state is intact. If dom0 state is lost then
460 * give it a chance to decide what to do if it has registered
461 * a handler for this event, otherwise panic.
462 *
463 * XXFM Could add some Solaris dom0 contract kill here?
464 */
465 if (dom0_state_lost) {
466 if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
467 dom_state = DOM0_TRAP;
468 send_guest_trap(dom0, 0, TRAP_machine_check);
469 /* XXFM case of return with !ripv ??? */
470 } else {
471 /* Commit telemetry for panic flow. */
472 if (mctc != NULL) {
473 x86_mcinfo_dump(mci);
474 mctelem_commit(mctc);
475 }
476 mc_panic("Dom0 state lost due to machine check "
477 "exception\n");
478 /*NOTREACHED*/
479 }
480 }
482 /*
483 * If a domU has lost state then send it a trap if it has registered
484 * a handler, otherwise crash the domain.
485 * XXFM Revisit this functionality.
486 */
487 if (domU_state_lost) {
488 if (guest_has_trap_callback(v->domain, v->vcpu_id,
489 TRAP_machine_check)) {
490 dom_state = DOMU_TRAP;
491 send_guest_trap(curdom, v->vcpu_id,
492 TRAP_machine_check);
493 } else {
494 dom_state = DOMU_KILLED;
495 /* Enable interrupts. This basically results in
496 * calling sti on the *physical* cpu. But after
497 * domain_crash() the vcpu pointer is invalid.
498 * Therefore, we must unlock the irqs before killing
499 * it. */
500 vcpu_schedule_unlock_irq(v);
501 irqlocked = 0;
503 /* DomU is impacted. Kill it and continue. */
504 domain_crash(curdom);
505 }
506 }
508 switch (dom_state) {
509 case DOM0_TRAP:
510 case DOMU_TRAP:
511 /* Enable interrupts. */
512 vcpu_schedule_unlock_irq(v);
513 irqlocked = 0;
515 /* guest softirqs and event callbacks are scheduled
516 * immediately after this handler exits. */
517 break;
518 case DOMU_KILLED:
519 /* Nothing to do here. */
520 break;
522 case DOM_NORMAL:
523 vcpu_schedule_unlock_irq(v);
524 irqlocked = 0;
525 break;
526 }
528 cmn_handler_done:
529 BUG_ON(irqlocked);
530 BUG_ON(!ripv);
532 if (bs.errcnt) {
533 /* Not panicking, so forward telemetry to dom0 now if it
534 * is interested. */
535 if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
536 if (mctc != NULL)
537 mctelem_commit(mctc);
538 send_guest_global_virq(dom0, VIRQ_MCA);
539 } else {
540 x86_mcinfo_dump(mci);
541 if (mctc != NULL)
542 mctelem_dismiss(mctc);
543 }
544 } else if (mctc != NULL) {
545 mctelem_dismiss(mctc);
546 }
547 }
549 void mcheck_mca_clearbanks(cpu_banks_t bankmask)
550 {
551 int i;
552 uint64_t status;
554 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
555 if (!test_bit(i, bankmask))
556 continue;
557 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
558 if (!(status & MCi_STATUS_VAL))
559 continue;
560 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
561 }
562 }
564 static int amd_mcheck_init(struct cpuinfo_x86 *ci)
565 {
566 int rc = 0;
568 switch (ci->x86) {
569 case 6:
570 rc = amd_k7_mcheck_init(ci);
571 break;
573 default:
574 /* Assume that machine check support is available.
575 * The minimum provided support is at least the K8. */
576 case 0xf:
577 rc = amd_k8_mcheck_init(ci);
578 break;
580 case 0x10:
581 case 0x11:
582 rc = amd_f10_mcheck_init(ci);
583 break;
584 }
586 return rc;
587 }
589 /* Check the existence of Machine Check */
590 int mce_available(struct cpuinfo_x86 *c)
591 {
592 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
593 }
595 static int mce_is_broadcast(struct cpuinfo_x86 *c)
596 {
597 if (mce_force_broadcast)
598 return 1;
600 /* According to the Intel SDM (Dec. 2009, section 15.10.4.1), for
601 * processors with a DisplayFamily_DisplayModel encoding of 06H_EH and above,
602 * an MCA signal is broadcast to all logical processors in the system.
603 */
604 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
605 c->x86_model >= 0xe)
606 return 1;
607 return 0;
608 }
610 /*
611 * Check if bank 0 is usable for MCE. It isn't for AMD K7,
612 * and Intel P6 family before model 0x1a.
613 */
614 int mce_firstbank(struct cpuinfo_x86 *c)
615 {
616 if (c->x86 == 6) {
617 if (c->x86_vendor == X86_VENDOR_AMD)
618 return 1;
620 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
621 return 1;
622 }
624 return 0;
625 }
627 /* This has to be run for each processor */
628 void mcheck_init(struct cpuinfo_x86 *c)
629 {
630 int inited = 0, i, broadcast;
631 static int broadcast_check;
633 if (mce_disabled == 1) {
634 dprintk(XENLOG_INFO, "MCE support disabled by bootparam\n");
635 return;
636 }
638 broadcast = mce_is_broadcast(c);
639 if (broadcast_check && (broadcast != mce_broadcast) )
640 dprintk(XENLOG_INFO,
641 "CPUs have mixed broadcast support"
642 "may cause undetermined result!!!\n");
644 broadcast_check = 1;
645 if (broadcast)
646 mce_broadcast = broadcast;
648 for (i = 0; i < MAX_NR_BANKS; i++)
649 set_bit(i,mca_allbanks);
651 /* Enforce at least MCE support in CPUID information. Individual
652 * families may also need to enforce a check for MCA support. */
653 if (!cpu_has(c, X86_FEATURE_MCE)) {
654 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
655 smp_processor_id());
656 return;
657 }
659 intpose_init();
660 mctelem_init(sizeof (struct mc_info));
662 switch (c->x86_vendor) {
663 case X86_VENDOR_AMD:
664 inited = amd_mcheck_init(c);
665 break;
667 case X86_VENDOR_INTEL:
668 switch (c->x86) {
669 case 6:
670 case 15:
671 inited = intel_mcheck_init(c);
672 break;
673 }
674 break;
676 default:
677 break;
678 }
680 if ( !h_mci_ctrl )
681 {
682 h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks);
683 if (!h_mci_ctrl)
684 {
685 dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n");
686 return;
687 }
688 /* Don't care about banks before firstbank */
689 memset(h_mci_ctrl, 0xff, nr_mce_banks * sizeof(*h_mci_ctrl));
690 for (i = firstbank; i < nr_mce_banks; i++)
691 rdmsrl(MSR_IA32_MC0_CTL + 4*i, h_mci_ctrl[i]);
692 }
693 if (g_mcg_cap & MCG_CTL_P)
694 rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl);
695 set_poll_bankmask(c);
696 if (!inited)
697 printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
698 smp_processor_id());
699 }
701 u64 mce_cap_init(void)
702 {
703 u32 l, h;
704 u64 value;
706 rdmsr(MSR_IA32_MCG_CAP, l, h);
707 value = ((u64)h << 32) | l;
708 /* For Guest vMCE usage */
709 g_mcg_cap = value & ~MCG_CMCI_P;
711 if (l & MCG_CTL_P) /* Control register present ? */
712 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
714 nr_mce_banks = l & MCG_CAP_COUNT;
715 if ( nr_mce_banks > MAX_NR_BANKS )
716 {
717 printk(KERN_WARNING "MCE: exceed max mce banks\n");
718 g_mcg_cap = (g_mcg_cap & ~MCG_CAP_COUNT) | MAX_NR_BANKS;
719 }
721 return value;
722 }
724 /* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
725 void mce_init_msr(struct domain *d)
726 {
727 d->arch.vmca_msrs.mcg_status = 0x0;
728 d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
729 d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
730 d->arch.vmca_msrs.nr_injection = 0;
731 memset(d->arch.vmca_msrs.mci_ctl, ~0,
732 sizeof(d->arch.vmca_msrs.mci_ctl));
733 INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
734 spin_lock_init(&d->arch.vmca_msrs.lock);
735 }
737 int mce_rdmsr(uint32_t msr, uint64_t *val)
738 {
739 struct domain *d = current->domain;
740 int ret = 1;
741 unsigned int bank;
742 struct bank_entry *entry = NULL;
744 *val = 0;
745 spin_lock(&d->arch.vmca_msrs.lock);
747 switch ( msr )
748 {
749 case MSR_IA32_MCG_STATUS:
750 *val = d->arch.vmca_msrs.mcg_status;
751 if (*val)
752 mce_printk(MCE_VERBOSE,
753 "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
754 break;
755 case MSR_IA32_MCG_CAP:
756 *val = d->arch.vmca_msrs.mcg_cap;
757 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
758 *val);
759 break;
760 case MSR_IA32_MCG_CTL:
761 /* Always 0 if no CTL support */
762 *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
763 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
764 *val);
765 break;
766 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
767 bank = (msr - MSR_IA32_MC0_CTL) / 4;
768 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
769 {
770 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
771 ret = 0;
772 break;
773 }
774 switch (msr & (MSR_IA32_MC0_CTL | 3))
775 {
776 case MSR_IA32_MC0_CTL:
777 *val = d->arch.vmca_msrs.mci_ctl[bank] &
778 (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
779 mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
780 bank, *val);
781 break;
782 case MSR_IA32_MC0_STATUS:
783 /* Only error bank is read. Non-error banks simply return. */
784 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
785 {
786 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
787 struct bank_entry, list);
788 if (entry->bank == bank) {
789 *val = entry->mci_status;
790 mce_printk(MCE_VERBOSE,
791 "MCE: rd MC%u_STATUS in vMCE# context "
792 "value 0x%"PRIx64"\n", bank, *val);
793 }
794 else
795 entry = NULL;
796 }
797 break;
798 case MSR_IA32_MC0_ADDR:
799 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
800 {
801 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
802 struct bank_entry, list);
803 if ( entry->bank == bank )
804 {
805 *val = entry->mci_addr;
806 mce_printk(MCE_VERBOSE,
807 "MCE: rdmsr MC%u_ADDR in vMCE# context "
808 "0x%"PRIx64"\n", bank, *val);
809 }
810 }
811 break;
812 case MSR_IA32_MC0_MISC:
813 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
814 {
815 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
816 struct bank_entry, list);
817 if ( entry->bank == bank )
818 {
819 *val = entry->mci_misc;
820 mce_printk(MCE_VERBOSE,
821 "MCE: rd MC%u_MISC in vMCE# context "
822 "0x%"PRIx64"\n", bank, *val);
823 }
824 }
825 break;
826 }
827 break;
828 default:
829 switch ( boot_cpu_data.x86_vendor )
830 {
831 case X86_VENDOR_INTEL:
832 ret = intel_mce_rdmsr(msr, val);
833 break;
834 default:
835 ret = 0;
836 break;
837 }
838 break;
839 }
841 spin_unlock(&d->arch.vmca_msrs.lock);
842 return ret;
843 }
845 int mce_wrmsr(u32 msr, u64 val)
846 {
847 struct domain *d = current->domain;
848 struct bank_entry *entry = NULL;
849 unsigned int bank;
850 int ret = 1;
852 if ( !g_mcg_cap )
853 return 0;
855 spin_lock(&d->arch.vmca_msrs.lock);
857 switch ( msr )
858 {
859 case MSR_IA32_MCG_CTL:
860 d->arch.vmca_msrs.mcg_ctl = val;
861 break;
862 case MSR_IA32_MCG_STATUS:
863 d->arch.vmca_msrs.mcg_status = val;
864 mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
865 /* For HVM guest, this is the point for deleting vMCE injection node */
866 if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
867 {
868 d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
869 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
870 {
871 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
872 struct bank_entry, list);
873 if ( entry->mci_status & MCi_STATUS_VAL )
874 mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
875 "been cleared before write MCG_STATUS MSR\n");
877 mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
878 "Node, nr_injection %u\n",
879 d->arch.vmca_msrs.nr_injection);
880 list_del(&entry->list);
881 xfree(entry);
882 }
883 else
884 mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
885 " last injection Node, something Wrong!\n");
886 }
887 break;
888 case MSR_IA32_MCG_CAP:
889 mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
890 ret = -1;
891 break;
892 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
893 bank = (msr - MSR_IA32_MC0_CTL) / 4;
894 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
895 {
896 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
897 ret = 0;
898 break;
899 }
900 switch ( msr & (MSR_IA32_MC0_CTL | 3) )
901 {
902 case MSR_IA32_MC0_CTL:
903 d->arch.vmca_msrs.mci_ctl[bank] = val;
904 break;
905 case MSR_IA32_MC0_STATUS:
906 /* Take the first entry of the list; it corresponds to the current
907 * vMCE# injection. When the vMCE# has been processed by the
908 * guest, this node will be deleted.
909 * Only the error bank is written. Non-error banks simply return.
910 */
911 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
912 {
913 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
914 struct bank_entry, list);
915 if ( entry->bank == bank )
916 entry->mci_status = val;
917 mce_printk(MCE_VERBOSE,
918 "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
919 bank, val);
920 }
921 else
922 mce_printk(MCE_VERBOSE,
923 "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
924 break;
925 case MSR_IA32_MC0_ADDR:
926 mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
927 ret = -1;
928 break;
929 case MSR_IA32_MC0_MISC:
930 mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
931 ret = -1;
932 break;
933 }
934 break;
935 default:
936 switch ( boot_cpu_data.x86_vendor )
937 {
938 case X86_VENDOR_INTEL:
939 ret = intel_mce_wrmsr(msr, val);
940 break;
941 default:
942 ret = 0;
943 break;
944 }
945 break;
946 }
948 spin_unlock(&d->arch.vmca_msrs.lock);
949 return ret;
950 }
952 static void mcinfo_clear(struct mc_info *mi)
953 {
954 memset(mi, 0, sizeof(struct mc_info));
955 x86_mcinfo_nentries(mi) = 0;
956 }
958 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
959 {
960 int i;
961 unsigned long end1, end2;
962 struct mcinfo_common *mic, *mic_base, *mic_index;
964 mic = (struct mcinfo_common *)mcinfo;
965 mic_index = mic_base = x86_mcinfo_first(mi);
967 /* go to first free entry */
968 for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
969 mic_index = x86_mcinfo_next(mic_index);
970 }
972 /* check if there is enough size */
973 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
974 end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
976 if (end1 < end2)
977 return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
979 /* there's enough space. add entry. */
980 memcpy(mic_index, mic, mic->size);
981 x86_mcinfo_nentries(mi)++;
983 return 0;
984 }
986 /* Dump machine check information in a format
987 * mcelog can parse. This is used only when
988 * Dom0 does not take the notification. */
989 void x86_mcinfo_dump(struct mc_info *mi)
990 {
991 struct mcinfo_common *mic = NULL;
992 struct mcinfo_global *mc_global;
993 struct mcinfo_bank *mc_bank;
995 /* first print the global info */
996 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
997 if (mic == NULL)
998 return;
999 mc_global = (struct mcinfo_global *)mic;
1000 if (mc_global->mc_flags & MC_FLAG_MCE) {
1001 printk(XENLOG_WARNING
1002 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
1003 mc_global->mc_coreid, mc_global->mc_gstatus);
1004 } else {
1005 printk(XENLOG_WARNING "MCE: The hardware reports a non "
1006 "fatal, correctable incident occurred on "
1007 "CPU %d.\n",
1008 mc_global->mc_coreid);
1011 /* then the bank information */
1012 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
1013 do {
1014 if (mic == NULL)
1015 return;
1016 if (mic->type != MC_TYPE_BANK)
1017 goto next;
1019 mc_bank = (struct mcinfo_bank *)mic;
1021 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
1022 mc_bank->mc_bank,
1023 mc_bank->mc_status);
1024 if (mc_bank->mc_status & MCi_STATUS_MISCV)
1025 printk("[%16"PRIx64"]", mc_bank->mc_misc);
1026 if (mc_bank->mc_status & MCi_STATUS_ADDRV)
1027 printk(" at %16"PRIx64, mc_bank->mc_addr);
1029 printk("\n");
1030 next:
1031 mic = x86_mcinfo_next(mic); /* next entry */
1032 if ((mic == NULL) || (mic->size == 0))
1033 break;
1034 } while (1);
1035 }
1037 static void do_mc_get_cpu_info(void *v)
1038 {
1039 int cpu = smp_processor_id();
1040 int cindex, cpn;
1041 struct cpuinfo_x86 *c;
1042 xen_mc_logical_cpu_t *log_cpus, *xcp;
1043 uint32_t junk, ebx;
1045 log_cpus = v;
1046 c = &cpu_data[cpu];
1047 cindex = 0;
1048 cpn = cpu - 1;
1050 /*
1051 * Deal with sparse masks, condensed into a contig array.
1052 */
1053 while (cpn >= 0) {
1054 if (cpu_isset(cpn, cpu_online_map))
1055 cindex++;
1056 cpn--;
1057 }
1059 xcp = &log_cpus[cindex];
1060 c = &cpu_data[cpu];
1061 xcp->mc_cpunr = cpu;
1062 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
1063 &xcp->mc_coreid, &xcp->mc_threadid,
1064 &xcp->mc_apicid, &xcp->mc_ncores,
1065 &xcp->mc_ncores_active, &xcp->mc_nthreads);
1066 xcp->mc_cpuid_level = c->cpuid_level;
1067 xcp->mc_family = c->x86;
1068 xcp->mc_vendor = c->x86_vendor;
1069 xcp->mc_model = c->x86_model;
1070 xcp->mc_step = c->x86_mask;
1071 xcp->mc_cache_size = c->x86_cache_size;
1072 xcp->mc_cache_alignment = c->x86_cache_alignment;
1073 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1074 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1075 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1077 /*
1078 * This part needs to run on the CPU itself.
1079 */
1080 xcp->mc_nmsrvals = __MC_NMSRS;
1081 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1082 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1084 if (c->cpuid_level >= 1) {
1085 cpuid(1, &junk, &ebx, &junk, &junk);
1086 xcp->mc_clusterid = (ebx >> 24) & 0xff;
1087 } else
1088 xcp->mc_clusterid = hard_smp_processor_id();
1089 }
1092 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
1093 uint16_t *threadid, uint32_t *apicid,
1094 unsigned *ncores, unsigned *ncores_active,
1095 unsigned *nthreads)
1096 {
1097 struct cpuinfo_x86 *c;
1099 *apicid = cpu_physical_id(cpu);
1100 c = &cpu_data[cpu];
1101 if (c->apicid == BAD_APICID) {
1102 *chipid = cpu;
1103 *coreid = 0;
1104 *threadid = 0;
1105 if (ncores != NULL)
1106 *ncores = 1;
1107 if (ncores_active != NULL)
1108 *ncores_active = 1;
1109 if (nthreads != NULL)
1110 *nthreads = 1;
1111 } else {
1112 *chipid = phys_proc_id[cpu];
1113 if (c->x86_max_cores > 1)
1114 *coreid = cpu_core_id[cpu];
1115 else
1116 *coreid = 0;
1117 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1118 if (ncores != NULL)
1119 *ncores = c->x86_max_cores;
1120 if (ncores_active != NULL)
1121 *ncores_active = c->booted_cores;
1122 if (nthreads != NULL)
1123 *nthreads = c->x86_num_siblings;
1124 }
1125 }
1127 #define INTPOSE_NENT 50
1129 static struct intpose_ent {
1130 unsigned int cpu_nr;
1131 uint64_t msr;
1132 uint64_t val;
1133 } intpose_arr[INTPOSE_NENT];
1135 static void intpose_init(void)
1136 {
1137 static int done;
1138 int i;
1140 if (done++ > 0)
1141 return;
1143 for (i = 0; i < INTPOSE_NENT; i++) {
1144 intpose_arr[i].cpu_nr = -1;
1145 }
1146 }
1149 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1150 uint64_t *valp)
1151 {
1152 int i;
1154 for (i = 0; i < INTPOSE_NENT; i++) {
1155 if (intpose_arr[i].cpu_nr == cpu_nr &&
1156 intpose_arr[i].msr == msr) {
1157 if (valp != NULL)
1158 *valp = intpose_arr[i].val;
1159 return &intpose_arr[i];
1160 }
1161 }
1163 return NULL;
1164 }
1166 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1167 {
1168 struct intpose_ent *ent;
1169 int i;
1171 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1172 ent->val = val;
1173 return;
1174 }
1176 for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
1177 if (ent->cpu_nr == -1) {
1178 ent->cpu_nr = cpu_nr;
1179 ent->msr = msr;
1180 ent->val = val;
1181 return;
1182 }
1183 }
1185 printk("intpose_add: interpose array full - request dropped\n");
1186 }
1188 void intpose_inval(unsigned int cpu_nr, uint64_t msr)
1189 {
1190 struct intpose_ent *ent;
1192 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1193 ent->cpu_nr = -1;
1194 }
1195 }
1197 #define IS_MCA_BANKREG(r) \
1198 ((r) >= MSR_IA32_MC0_CTL && \
1199 (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
1200 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
1202 int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d)
1203 {
1204 int bank_nr;
1206 if ( !bank || !d || !h_mci_ctrl )
1207 return 1;
1209 /* Will an MCE happen in the host if the host mcg_ctl is 0? */
1210 if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
1211 return 1;
1213 bank_nr = bank->mc_bank;
1214 if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
1215 return 1;
1216 return 0;
1217 }
1219 static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1220 {
1221 struct cpuinfo_x86 *c;
1222 int i, errs = 0;
1224 c = &cpu_data[smp_processor_id()];
1226 for (i = 0; i < mci->mcinj_count; i++) {
1227 uint64_t reg = mci->mcinj_msr[i].reg;
1228 const char *reason = NULL;
1230 if (IS_MCA_BANKREG(reg)) {
1231 if (c->x86_vendor == X86_VENDOR_AMD) {
1232 /* On AMD we can set MCi_STATUS_WREN in the
1233 * HWCR MSR to allow non-zero writes to bank
1234 * MSRs without a #GP. The injector in dom0
1235 * should set that bit, but we detect when it
1236 * is necessary and set it as a courtesy to
1237 * avoid a #GP in the hypervisor. */
1238 mci->mcinj_flags |=
1239 _MC_MSRINJ_F_REQ_HWCR_WREN;
1240 continue;
1241 } else {
1242 /* No alternative but to interpose, so require
1243 * that the injector specifies as much. */
1244 if (!(mci->mcinj_flags &
1245 MC_MSRINJ_F_INTERPOSE)) {
1246 reason = "must specify interposition";
1247 }
1248 }
1249 } else {
1250 switch (reg) {
1251 /* MSRs acceptable on all x86 cpus */
1252 case MSR_IA32_MCG_STATUS:
1253 break;
1255 /* MSRs that the HV will take care of */
1256 case MSR_K8_HWCR:
1257 if (c->x86_vendor == X86_VENDOR_AMD)
1258 reason = "HV will operate HWCR";
1259 else
1260 reason ="only supported on AMD";
1261 break;
1263 default:
1264 reason = "not a recognized MCA MSR";
1265 break;
1266 }
1267 }
1269 if (reason != NULL) {
1270 printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
1271 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1272 errs++;
1273 }
1274 }
1276 return !errs;
1277 }
1279 static uint64_t x86_mc_hwcr_wren(void)
1280 {
1281 uint64_t old;
1283 rdmsrl(MSR_K8_HWCR, old);
1285 if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
1286 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1287 wrmsrl(MSR_K8_HWCR, new);
1288 }
1290 return old;
1291 }
1293 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1294 {
1295 if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
1296 wrmsrl(MSR_K8_HWCR, hwcr);
1297 }
1299 static void x86_mc_msrinject(void *data)
1300 {
1301 struct xen_mc_msrinject *mci = data;
1302 struct mcinfo_msr *msr;
1303 struct cpuinfo_x86 *c;
1304 uint64_t hwcr = 0;
1305 int intpose;
1306 int i;
1308 c = &cpu_data[smp_processor_id()];
1310 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1311 hwcr = x86_mc_hwcr_wren();
1313 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1315 for (i = 0, msr = &mci->mcinj_msr[0];
1316 i < mci->mcinj_count; i++, msr++) {
1317 printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
1318 "<-- 0x%llx\n",
1319 intpose ? "interpose" : "hardware",
1320 mci->mcinj_cpunr, smp_processor_id(),
1321 (unsigned long long)msr->reg,
1322 (unsigned long long)msr->value);
1324 if (intpose)
1325 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1326 else
1327 wrmsrl(msr->reg, msr->value);
1328 }
1330 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1331 x86_mc_hwcr_wren_restore(hwcr);
1332 }
1334 /*ARGSUSED*/
1335 static void x86_mc_mceinject(void *data)
1336 {
1337 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1338 __asm__ __volatile__("int $0x12");
1339 }
1341 #if BITS_PER_LONG == 64
1343 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1344 #define COOKIE2ID(c) ((uint64_t)(c))
1346 #elif BITS_PER_LONG == 32
1348 #define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
1349 #define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
1351 #elif defined(BITS_PER_LONG)
1352 #error BITS_PER_LONG has unexpected value
1353 #else
1354 #error BITS_PER_LONG definition absent
1355 #endif
1357 #ifdef CONFIG_COMPAT
1358 # include <compat/arch-x86/xen-mca.h>
1360 # define xen_mcinfo_msr mcinfo_msr
1361 CHECK_mcinfo_msr;
1362 # undef xen_mcinfo_msr
1363 # undef CHECK_mcinfo_msr
1364 # define CHECK_mcinfo_msr struct mcinfo_msr
1366 # define xen_mcinfo_common mcinfo_common
1367 CHECK_mcinfo_common;
1368 # undef xen_mcinfo_common
1369 # undef CHECK_mcinfo_common
1370 # define CHECK_mcinfo_common struct mcinfo_common
1372 CHECK_FIELD_(struct, mc_fetch, flags);
1373 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1374 # define CHECK_compat_mc_fetch struct mc_fetch
1376 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1377 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1379 CHECK_mc;
1380 # undef CHECK_compat_mc_fetch
1381 # undef CHECK_compat_mc_physcpuinfo
1383 # define xen_mc_info mc_info
1384 CHECK_mc_info;
1385 # undef xen_mc_info
1387 # define xen_mcinfo_global mcinfo_global
1388 CHECK_mcinfo_global;
1389 # undef xen_mcinfo_global
1391 # define xen_mcinfo_bank mcinfo_bank
1392 CHECK_mcinfo_bank;
1393 # undef xen_mcinfo_bank
1395 # define xen_mcinfo_extended mcinfo_extended
1396 CHECK_mcinfo_extended;
1397 # undef xen_mcinfo_extended
1399 # define xen_mcinfo_recovery mcinfo_recovery
1400 # define xen_cpu_offline_action cpu_offline_action
1401 # define xen_page_offline_action page_offline_action
1402 CHECK_mcinfo_recovery;
1403 # undef xen_cpu_offline_action
1404 # undef xen_page_offline_action
1405 # undef xen_mcinfo_recovery
1406 #else
1407 # define compat_mc_fetch xen_mc_fetch
1408 # define compat_mc_physcpuinfo xen_mc_physcpuinfo
1409 # define compat_handle_is_null guest_handle_is_null
1410 # define copy_to_compat copy_to_guest
1411 #endif
1413 /* Machine Check Architecture Hypercall */
1414 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
1415 {
1416 long ret = 0;
1417 struct xen_mc curop, *op = &curop;
1418 struct vcpu *v = current;
1419 union {
1420 struct xen_mc_fetch *nat;
1421 struct compat_mc_fetch *cmp;
1422 } mc_fetch;
1423 union {
1424 struct xen_mc_physcpuinfo *nat;
1425 struct compat_mc_physcpuinfo *cmp;
1426 } mc_physcpuinfo;
1427 uint32_t flags, cmdflags;
1428 int nlcpu;
1429 xen_mc_logical_cpu_t *log_cpus = NULL;
1430 mctelem_cookie_t mctc;
1431 mctelem_class_t which;
1432 unsigned int target;
1433 struct xen_mc_msrinject *mc_msrinject;
1434 struct xen_mc_mceinject *mc_mceinject;
1436 if (!IS_PRIV(v->domain) )
1437 return x86_mcerr(NULL, -EPERM);
1439 if ( copy_from_guest(op, u_xen_mc, 1) )
1440 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1442 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1443 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1445 switch (op->cmd) {
1446 case XEN_MC_fetch:
1447 mc_fetch.nat = &op->u.mc_fetch;
1448 cmdflags = mc_fetch.nat->flags;
1450 switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
1451 case XEN_MC_NONURGENT:
1452 which = MC_NONURGENT;
1453 break;
1455 case XEN_MC_URGENT:
1456 which = MC_URGENT;
1457 break;
1459 default:
1460 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1463 flags = XEN_MC_OK;
1465 if (cmdflags & XEN_MC_ACK) {
1466 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1467 mctelem_ack(which, cookie);
1468 } else {
1469 if (!is_pv_32on64_vcpu(v)
1470 ? guest_handle_is_null(mc_fetch.nat->data)
1471 : compat_handle_is_null(mc_fetch.cmp->data))
1472 return x86_mcerr("do_mca fetch: guest buffer "
1473 "invalid", -EINVAL);
1475 if ((mctc = mctelem_consume_oldest_begin(which))) {
1476 struct mc_info *mcip = mctelem_dataptr(mctc);
1477 if (!is_pv_32on64_vcpu(v)
1478 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1479 : copy_to_compat(mc_fetch.cmp->data,
1480 mcip, 1)) {
1481 ret = -EFAULT;
1482 flags |= XEN_MC_FETCHFAILED;
1483 mc_fetch.nat->fetch_id = 0;
1484 } else {
1485 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1486 }
1487 mctelem_consume_oldest_end(mctc);
1488 } else {
1489 /* There is no data */
1490 flags |= XEN_MC_NODATA;
1491 mc_fetch.nat->fetch_id = 0;
1492 }
1493 }
1494 mc_fetch.nat->flags = flags;
1495 if (copy_to_guest(u_xen_mc, op, 1) != 0)
1496 ret = -EFAULT;
1497 }
1499 break;
1501 case XEN_MC_notifydomain:
1502 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1504 case XEN_MC_physcpuinfo:
1505 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1506 nlcpu = num_online_cpus();
1508 if (!is_pv_32on64_vcpu(v)
1509 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1510 : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
1511 if (mc_physcpuinfo.nat->ncpus <= 0)
1512 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1513 -EINVAL);
1514 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1515 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1516 if (log_cpus == NULL)
1517 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1519 if (on_each_cpu(do_mc_get_cpu_info, log_cpus, 1)) {
1520 xfree(log_cpus);
1521 return x86_mcerr("do_mca cpuinfo", -EIO);
1523 if (!is_pv_32on64_vcpu(v)
1524 ? copy_to_guest(mc_physcpuinfo.nat->info,
1525 log_cpus, nlcpu)
1526 : copy_to_compat(mc_physcpuinfo.cmp->info,
1527 log_cpus, nlcpu))
1528 ret = -EFAULT;
1529 xfree(log_cpus);
1530 }
1532 mc_physcpuinfo.nat->ncpus = nlcpu;
1534 if (copy_to_guest(u_xen_mc, op, 1))
1535 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1537 break;
1539 case XEN_MC_msrinject:
1540 if (nr_mce_banks == 0)
1541 return x86_mcerr("do_mca inject", -ENODEV);
1543 mc_msrinject = &op->u.mc_msrinject;
1544 target = mc_msrinject->mcinj_cpunr;
1546 if (target >= NR_CPUS)
1547 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1549 if (!cpu_isset(target, cpu_online_map))
1550 return x86_mcerr("do_mca inject: target offline",
1551 -EINVAL);
1553 if (mc_msrinject->mcinj_count == 0)
1554 return 0;
1556 if (!x86_mc_msrinject_verify(mc_msrinject))
1557 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1559 add_taint(TAINT_ERROR_INJECT);
1561 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1562 mc_msrinject, 1);
1564 break;
1566 case XEN_MC_mceinject:
1567 if (nr_mce_banks == 0)
1568 return x86_mcerr("do_mca #MC", -ENODEV);
1570 mc_mceinject = &op->u.mc_mceinject;
1571 target = mc_mceinject->mceinj_cpunr;
1573 if (target >= NR_CPUS)
1574 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1576 if (!cpu_isset(target, cpu_online_map))
1577 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1579 add_taint(TAINT_ERROR_INJECT);
1581 if ( mce_broadcast )
1582 on_each_cpu(x86_mc_mceinject, mc_mceinject, 0);
1583 else
1584 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1585 mc_mceinject, 1);
1586 break;
1588 default:
1589 return x86_mcerr("do_mca: bad command", -EINVAL);
1592 return ret;
1594 void set_poll_bankmask(struct cpuinfo_x86 *c)
1597 if (cmci_support && !mce_disabled) {
1598 memcpy(&(__get_cpu_var(poll_bankmask)),
1599 &(__get_cpu_var(no_cmci_banks)), sizeof(cpu_banks_t));
1600 }
1601 else {
1602 memcpy(&(get_cpu_var(poll_bankmask)), &mca_allbanks, sizeof(cpu_banks_t));
1603 if (mce_firstbank(c))
1604 clear_bit(0, get_cpu_var(poll_bankmask));
1605 }
1606 }
1607 void mc_panic(char *s)
1608 {
1609 is_mc_panic = 1;
1610 console_force_unlock();
1611 printk("Fatal machine check: %s\n", s);
1612 printk("\n"
1613 "****************************************\n"
1614 "\n"
1615 " The processor has reported a hardware error which cannot\n"
1616 " be recovered from. Xen will now reboot the machine.\n");
1617 panic("HARDWARE ERROR");