
xen/arch/x86/cpu/mcheck/mce.c @ 20998:50ea24db1f88

x86/mcheck: do not blindly de-reference dom0 et al

Since machine checks and CMCIs can happen before Dom0 even gets
constructed, the handlers of these events have to avoid de-referencing
respective pointers without checking.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 17 12:04:50 2010 +0000 (2010-02-17)
parents da7ae6d8838a
children 6384675aa29a
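
The pattern this change introduces can be seen in mcheck_cmn_handler() in the
listing below: every access to dom0 (and its vcpu array) is guarded before the
trap is delivered. A minimal sketch of that guard, extracted from the listing
for illustration only:

    /* dom0 may still be NULL, or only partially constructed, when a
     * #MC or CMCI arrives, so check before dereferencing. */
    if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
        guest_has_trap_callback(dom0, 0, TRAP_machine_check))
            send_guest_trap(dom0, 0, TRAP_machine_check);
    else
            mc_panic("Dom0 state lost due to machine check exception\n");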
line source
1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/config.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/sched-if.h>
15 #include <xen/cpumask.h>
16 #include <xen/event.h>
17 #include <xen/guest_access.h>
18 #include <xen/hypercall.h> /* for do_mca */
20 #include <asm/processor.h>
21 #include <asm/system.h>
22 #include <asm/msr.h>
24 #include "mce.h"
26 int mce_disabled;
27 invbool_param("mce", mce_disabled);
28 static int mce_force_broadcast;
29 boolean_param("mce_fb", mce_force_broadcast);
30 int is_mc_panic;
31 unsigned int nr_mce_banks;
33 int mce_broadcast = 0;
34 static uint64_t g_mcg_cap;
36 /* Real value in physical CTL MSR */
37 static uint64_t h_mcg_ctl = 0UL;
38 static uint64_t *h_mci_ctrl;
39 int firstbank;
41 static void intpose_init(void);
42 static void mcinfo_clear(struct mc_info *);
44 #define SEG_PL(segsel) ((segsel) & 0x3)
45 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
47 #if 0
48 static int x86_mcerr(const char *msg, int err)
49 {
50 gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n",
51 msg != NULL ? msg : "", err);
52 return err;
53 }
54 #else
55 #define x86_mcerr(msg, err) (err)
56 #endif
58 cpu_banks_t mca_allbanks;
60 int mce_verbosity;
61 static void __init mce_set_verbosity(char *str)
62 {
63 if (strcmp("verbose", str) == 0)
64 mce_verbosity = MCE_VERBOSE;
65 else
66 printk(KERN_DEBUG "Machine Check verbosity level %s not recognised; "
67 "use mce_verbosity=verbose\n", str);
68 }
69 custom_param("mce_verbosity", mce_set_verbosity);
71 /* Handle unconfigured int18 (should never happen) */
72 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
73 {
74 printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
75 smp_processor_id());
76 }
79 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
81 void x86_mce_vector_register(x86_mce_vector_t hdlr)
82 {
83 _machine_check_vector = hdlr;
84 wmb();
85 }
87 /* Call the installed machine check handler for this CPU setup. */
89 void machine_check_vector(struct cpu_user_regs *regs, long error_code)
90 {
91 _machine_check_vector(regs, error_code);
92 }
94 /* Init machine check callback handler
95 * It is used to collect additional information provided by newer
96 * CPU families/models without the need to duplicate the whole handler.
97 * This avoids having many handlers that do nearly the same thing, each
98 * with its own tweaks and bugs. */
99 static x86_mce_callback_t mc_callback_bank_extended = NULL;
101 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
102 {
103 mc_callback_bank_extended = cbfunc;
104 }
106 /* Machine check recoverable judgement callback handler
107 * It is used to judge whether a UC error is recoverable by software
108 */
109 static mce_recoverable_t mc_recoverable_scan = NULL;
111 void mce_recoverable_register(mce_recoverable_t cbfunc)
112 {
113 mc_recoverable_scan = cbfunc;
114 }
116 /* Callback handler to judge whether to clear a Machine Check error bank.
117 * According to Intel's latest MCA OS Recovery Writer's Guide,
118 * whether an error's MCA bank needs to be cleared is decided by the mca_source
119 * and the MCi_STATUS bit values.
120 */
121 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
123 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
124 {
125 mc_need_clearbank_scan = cbfunc;
126 }
128 /* Utility function to perform MCA bank telemetry readout and to push that
129 * telemetry towards an interested dom0 for logging and diagnosis.
130 * The caller - #MC handler or MCA poll function - must arrange that we
131 * do not migrate cpus. */
133 /* XXFM Could add overflow counting? */
135 /* Add out-parameter clear_bank for the Machine Check handler caller.
136 * For the latest Intel CPUs, whether to clear the error bank status needs to
137 * be judged by the callback function defined above.
138 */
139 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
140 struct mca_summary *sp, cpu_banks_t* clear_bank)
141 {
142 struct vcpu *v = current;
143 struct domain *d;
144 uint64_t gstatus, status, addr, misc;
145 struct mcinfo_global mcg; /* on stack */
146 struct mcinfo_common *mic;
147 struct mcinfo_global *mig; /* on stack */
148 mctelem_cookie_t mctc = NULL;
149 uint32_t uc = 0, pcc = 0, recover, need_clear = 1;
150 struct mc_info *mci = NULL;
151 mctelem_class_t which = MC_URGENT; /* XXXgcc */
152 unsigned int cpu_nr;
153 int errcnt = 0;
154 int i;
155 enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
157 cpu_nr = smp_processor_id();
158 BUG_ON(cpu_nr != v->processor);
160 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
162 memset(&mcg, 0, sizeof (mcg));
163 mcg.common.type = MC_TYPE_GLOBAL;
164 mcg.common.size = sizeof (mcg);
165 if (v != NULL && ((d = v->domain) != NULL)) {
166 mcg.mc_domid = d->domain_id;
167 mcg.mc_vcpuid = v->vcpu_id;
168 } else {
169 mcg.mc_domid = -1;
170 mcg.mc_vcpuid = -1;
171 }
172 mcg.mc_gstatus = gstatus; /* MCG_STATUS */
174 switch (who) {
175 case MCA_MCE_HANDLER:
176 case MCA_MCE_SCAN:
177 mcg.mc_flags = MC_FLAG_MCE;
178 which = MC_URGENT;
179 break;
181 case MCA_POLLER:
182 case MCA_RESET:
183 mcg.mc_flags = MC_FLAG_POLLED;
184 which = MC_NONURGENT;
185 break;
187 case MCA_CMCI_HANDLER:
188 mcg.mc_flags = MC_FLAG_CMCI;
189 which = MC_NONURGENT;
190 break;
192 default:
193 BUG();
194 }
196 /* Retrieve detector information */
197 x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
198 &mcg.mc_coreid, &mcg.mc_core_threadid,
199 &mcg.mc_apicid, NULL, NULL, NULL);
201 /* If no mc_recoverable_scan callback handler is registered,
202 * this error is not recoverable.
203 */
204 recover = (mc_recoverable_scan)? 1: 0;
206 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
207 struct mcinfo_bank mcb; /* on stack */
209 /* Skip bank if corresponding bit in bankmask is clear */
210 if (!test_bit(i, bankmask))
211 continue;
213 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
214 if (!(status & MCi_STATUS_VAL))
215 continue; /* this bank has no valid telemetry */
217 /* For the latest Intel CPUs' CMCI/MCE handler caller, we need to
218 * decide whether to clear the bank based on MCi_STATUS bits such as
219 * OVER/UC/EN/PCC/S/AR.
220 */
221 if ( mc_need_clearbank_scan )
222 need_clear = mc_need_clearbank_scan(who, status);
224 /* If this is the first bank with valid MCA DATA, then
225 * try to reserve an entry from the urgent/nonurgent queue
226 * depending on whether we are called from an exception or
227 * a poller; this can fail (for example dom0 may not
228 * yet have consumed past telemetry). */
229 if (errcnt == 0) {
230 if ((mctc = mctelem_reserve(which)) != NULL) {
231 mci = mctelem_dataptr(mctc);
232 mcinfo_clear(mci);
233 }
234 }
236 memset(&mcb, 0, sizeof (mcb));
237 mcb.common.type = MC_TYPE_BANK;
238 mcb.common.size = sizeof (mcb);
239 mcb.mc_bank = i;
240 mcb.mc_status = status;
242 /* form a mask of which banks have logged uncorrected errors */
243 if ((status & MCi_STATUS_UC) != 0)
244 uc |= (1 << i);
246 /* likewise for those with processor context corrupt */
247 if ((status & MCi_STATUS_PCC) != 0)
248 pcc |= (1 << i);
250 if (recover && uc)
251 /* uc = 1, recover = 1, we need not panic.
252 */
253 recover = mc_recoverable_scan(status);
255 addr = misc = 0;
257 if (status & MCi_STATUS_ADDRV) {
258 mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
259 if (mfn_valid(paddr_to_pfn(addr))) {
260 d = maddr_get_owner(addr);
261 if (d != NULL && (who == MCA_POLLER ||
262 who == MCA_CMCI_HANDLER))
263 mcb.mc_domid = d->domain_id;
264 }
265 }
267 if (status & MCi_STATUS_MISCV)
268 mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
270 mcb.mc_addr = addr;
271 mcb.mc_misc = misc;
273 if (who == MCA_CMCI_HANDLER) {
274 mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
275 rdtscll(mcb.mc_tsc);
276 }
278 /* Increment the error count; if this is the first bank
279 * with a valid error then add the global info to the mcinfo. */
280 if (errcnt++ == 0 && mci != NULL)
281 x86_mcinfo_add(mci, &mcg);
283 /* Add the bank data */
284 if (mci != NULL)
285 x86_mcinfo_add(mci, &mcb);
287 if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
288 cbret = mc_callback_bank_extended(mci, i, status);
289 }
291 /* By default, need_clear = 1 */
292 if (who != MCA_MCE_SCAN && need_clear)
293 /* Clear status */
294 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
295 else if ( who == MCA_MCE_SCAN && need_clear)
296 set_bit(i, clear_bank);
298 wmb();
299 }
301 if (mci != NULL && errcnt > 0) {
302 x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
303 mig = container_of(mic, struct mcinfo_global, common);
304 if (mic == NULL)
305 ;
306 else if (pcc)
307 mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
308 else if (uc)
309 mig->mc_flags |= MC_FLAG_RECOVERABLE;
310 else
311 mig->mc_flags |= MC_FLAG_CORRECTABLE;
312 }
315 if (sp) {
316 sp->errcnt = errcnt;
317 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
318 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
319 sp->uc = uc;
320 sp->pcc = pcc;
321 sp->recoverable = recover;
322 }
324 return mci != NULL ? mctc : NULL; /* may be NULL */
325 }
327 #define DOM_NORMAL 0
328 #define DOM0_TRAP 1
329 #define DOMU_TRAP 2
330 #define DOMU_KILLED 4
332 /* Shared #MC handler. */
333 void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
334 cpu_banks_t bankmask)
335 {
336 int xen_state_lost, dom0_state_lost, domU_state_lost;
337 struct vcpu *v = current;
338 struct domain *curdom = v->domain;
339 domid_t domid = curdom->domain_id;
340 int ctx_xen, ctx_dom0, ctx_domU;
341 uint32_t dom_state = DOM_NORMAL;
342 mctelem_cookie_t mctc = NULL;
343 struct mca_summary bs;
344 struct mc_info *mci = NULL;
345 int irqlocked = 0;
346 uint64_t gstatus;
347 int ripv;
349 /* This handler runs as an interrupt gate, so IPIs from the
350 * polling service routine are deferred until we're finished.
351 */
353 /* Disable interrupts for the _vcpu_ so that it cannot be rescheduled to
354 * another physical CPU. */
355 vcpu_schedule_lock_irq(v);
356 irqlocked = 1;
358 /* Read global status; if it does not indicate machine check
359 * in progress then bail as long as we have a valid ip to return to. */
360 mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
361 ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
362 if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
363 add_taint(TAINT_MACHINE_CHECK); /* questionable */
364 vcpu_schedule_unlock_irq(v);
365 irqlocked = 0;
366 goto cmn_handler_done;
367 }
369 /* Go and grab error telemetry. We must choose whether to commit
370 * for logging or dismiss the cookie that is returned, and must not
371 * reference the cookie after that action.
372 */
373 mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
374 if (mctc != NULL)
375 mci = (struct mc_info *)mctelem_dataptr(mctc);
377 /* Clear MCIP or another #MC will enter shutdown state */
378 gstatus &= ~MCG_STATUS_MCIP;
379 mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
380 wmb();
382 /* If no valid errors and our stack is intact, we're done */
383 if (ripv && bs.errcnt == 0) {
384 vcpu_schedule_unlock_irq(v);
385 irqlocked = 0;
386 goto cmn_handler_done;
387 }
389 if (bs.uc || bs.pcc)
390 add_taint(TAINT_MACHINE_CHECK);
392 /* Machine check exceptions will usually be for UC and/or PCC errors,
393 * but it is possible to configure machine check for some classes
394 * of corrected error.
395 *
396 * UC errors could compromise any domain or the hypervisor
397 * itself - for example a cache writeback of modified data that
398 * turned out to be bad could be for data belonging to anyone, not
399 * just the current domain. In the absence of known data poisoning
400 * to prevent consumption of such bad data in the system we regard
401 * all UC errors as terminal. It may be possible to attempt some
402 * heuristics based on the address affected, which guests have
403 * mappings to that mfn etc.
404 *
405 * PCC errors apply to the current context.
406 *
407 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
408 * and not PCC is terminal - the return instruction pointer
409 * pushed onto the stack is bogus. If the interrupt context is
410 * the hypervisor or dom0 the game is over, otherwise we can
411 * limit the impact to a single domU but only if we trampoline
412 * somewhere safely - we can't return and unwind the stack.
413 * Since there is no trampoline in place we will treat !RIPV
414 * as terminal for any context.
415 */
416 ctx_xen = SEG_PL(regs->cs) == 0;
417 ctx_dom0 = !ctx_xen && (domid == 0);
418 ctx_domU = !ctx_xen && !ctx_dom0;
420 xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
421 !ripv;
422 dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
423 domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
425 if (xen_state_lost) {
426 /* Now we are going to panic anyway. Allow interrupts, so that
427 * printk on serial console can work. */
428 vcpu_schedule_unlock_irq(v);
429 irqlocked = 0;
431 printk("Terminal machine check exception occurred in "
432 "hypervisor context.\n");
434 /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
435 * to the error, then it makes sense to print a stack trace.
436 * That can be useful for more detailed error analysis and/or
437 * error case studies to figure out whether we can clear
438 * xen_impacted and kill a DomU instead
439 * (i.e. if only a guest control structure is affected, but then
440 * we must ensure the bad pages are not re-used).
441 */
442 if (bs.eipv) {
443 printk("MCE: Instruction Pointer is related to the "
444 "error, therefore print the execution state.\n");
445 show_execution_state(regs);
446 }
448 /* Commit the telemetry so that panic flow can find it. */
449 if (mctc != NULL) {
450 x86_mcinfo_dump(mci);
451 mctelem_commit(mctc);
452 }
453 mc_panic("Hypervisor state lost due to machine check "
454 "exception.\n");
455 /*NOTREACHED*/
456 }
458 /*
459 * Xen hypervisor state is intact. If dom0 state is lost then
460 * give it a chance to decide what to do if it has registered
461 * a handler for this event, otherwise panic.
462 *
463 * XXFM Could add some Solaris dom0 contract kill here?
464 */
465 if (dom0_state_lost) {
466 if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
467 guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
468 dom_state = DOM0_TRAP;
469 send_guest_trap(dom0, 0, TRAP_machine_check);
470 /* XXFM case of return with !ripv ??? */
471 } else {
472 /* Commit telemetry for panic flow. */
473 if (mctc != NULL) {
474 x86_mcinfo_dump(mci);
475 mctelem_commit(mctc);
476 }
477 mc_panic("Dom0 state lost due to machine check "
478 "exception\n");
479 /*NOTREACHED*/
480 }
481 }
483 /*
484 * If a domU has lost state then send it a trap if it has registered
485 * a handler, otherwise crash the domain.
486 * XXFM Revisit this functionality.
487 */
488 if (domU_state_lost) {
489 if (guest_has_trap_callback(v->domain, v->vcpu_id,
490 TRAP_machine_check)) {
491 dom_state = DOMU_TRAP;
492 send_guest_trap(curdom, v->vcpu_id,
493 TRAP_machine_check);
494 } else {
495 dom_state = DOMU_KILLED;
496 /* Enable interrupts. This basically results in
497 * calling sti on the *physical* cpu. But after
498 * domain_crash() the vcpu pointer is invalid.
499 * Therefore, we must unlock the irqs before killing
500 * it. */
501 vcpu_schedule_unlock_irq(v);
502 irqlocked = 0;
504 /* DomU is impacted. Kill it and continue. */
505 domain_crash(curdom);
506 }
507 }
509 switch (dom_state) {
510 case DOM0_TRAP:
511 case DOMU_TRAP:
512 /* Enable interrupts. */
513 vcpu_schedule_unlock_irq(v);
514 irqlocked = 0;
516 /* guest softirqs and event callbacks are scheduled
517 * immediately after this handler exits. */
518 break;
519 case DOMU_KILLED:
520 /* Nothing to do here. */
521 break;
523 case DOM_NORMAL:
524 vcpu_schedule_unlock_irq(v);
525 irqlocked = 0;
526 break;
527 }
529 cmn_handler_done:
530 BUG_ON(irqlocked);
531 BUG_ON(!ripv);
533 if (bs.errcnt) {
534 /* Not panicking, so forward telemetry to dom0 now if it
535 * is interested. */
536 if (dom0_vmce_enabled()) {
537 if (mctc != NULL)
538 mctelem_commit(mctc);
539 send_guest_global_virq(dom0, VIRQ_MCA);
540 } else {
541 x86_mcinfo_dump(mci);
542 if (mctc != NULL)
543 mctelem_dismiss(mctc);
544 }
545 } else if (mctc != NULL) {
546 mctelem_dismiss(mctc);
547 }
548 }
550 void mcheck_mca_clearbanks(cpu_banks_t bankmask)
551 {
552 int i;
553 uint64_t status;
555 for (i = 0; i < 32 && i < nr_mce_banks; i++) {
556 if (!test_bit(i, bankmask))
557 continue;
558 mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
559 if (!(status & MCi_STATUS_VAL))
560 continue;
561 mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
562 }
563 }
565 static int amd_mcheck_init(struct cpuinfo_x86 *ci)
566 {
567 int rc = 0;
569 switch (ci->x86) {
570 case 6:
571 rc = amd_k7_mcheck_init(ci);
572 break;
574 default:
575 /* Assume that machine check support is available;
576 * the minimum support provided is that of the K8. */
577 case 0xf:
578 rc = amd_k8_mcheck_init(ci);
579 break;
581 case 0x10:
582 case 0x11:
583 rc = amd_f10_mcheck_init(ci);
584 break;
585 }
587 return rc;
588 }
590 /* Check for the existence of Machine Check support */
591 int mce_available(struct cpuinfo_x86 *c)
592 {
593 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
594 }
596 static int mce_is_broadcast(struct cpuinfo_x86 *c)
597 {
598 if (mce_force_broadcast)
599 return 1;
601 /* According to the Intel SDM (Dec. 2009), section 15.10.4.1: for processors
602 * with a DisplayFamily_DisplayModel encoding of 06H_EH and above,
603 * an MCA signal is broadcast to all logical processors in the system.
604 */
605 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
606 c->x86_model >= 0xe)
607 return 1;
608 return 0;
609 }
611 /*
612 * Check if bank 0 is usable for MCE. It isn't for AMD K7,
613 * and Intel P6 family before model 0x1a.
614 */
615 int mce_firstbank(struct cpuinfo_x86 *c)
616 {
617 if (c->x86 == 6) {
618 if (c->x86_vendor == X86_VENDOR_AMD)
619 return 1;
621 if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
622 return 1;
623 }
625 return 0;
626 }
628 /* This has to be run for each processor */
629 void mcheck_init(struct cpuinfo_x86 *c)
630 {
631 int inited = 0, i, broadcast;
632 static int broadcast_check;
634 if (mce_disabled == 1) {
635 dprintk(XENLOG_INFO, "MCE support disabled by bootparam\n");
636 return;
637 }
639 broadcast = mce_is_broadcast(c);
640 if (broadcast_check && (broadcast != mce_broadcast) )
641 dprintk(XENLOG_INFO,
642 "CPUs have mixed broadcast support"
643 "may cause undetermined result!!!\n");
645 broadcast_check = 1;
646 if (broadcast)
647 mce_broadcast = broadcast;
649 for (i = 0; i < MAX_NR_BANKS; i++)
650 set_bit(i,mca_allbanks);
652 /* Enforce at least MCE support in CPUID information. Individual
653 * families may also need to enforce a check for MCA support. */
654 if (!cpu_has(c, X86_FEATURE_MCE)) {
655 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
656 smp_processor_id());
657 return;
658 }
660 intpose_init();
661 mctelem_init(sizeof (struct mc_info));
663 switch (c->x86_vendor) {
664 case X86_VENDOR_AMD:
665 inited = amd_mcheck_init(c);
666 break;
668 case X86_VENDOR_INTEL:
669 switch (c->x86) {
670 case 6:
671 case 15:
672 inited = intel_mcheck_init(c);
673 break;
674 }
675 break;
677 default:
678 break;
679 }
681 if ( !h_mci_ctrl )
682 {
683 h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks);
684 if (!h_mci_ctrl)
685 {
686 dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n");
687 return;
688 }
689 /* Don't care about banks before firstbank */
690 memset(h_mci_ctrl, 0xff, nr_mce_banks * sizeof(*h_mci_ctrl));
691 for (i = firstbank; i < nr_mce_banks; i++)
692 rdmsrl(MSR_IA32_MC0_CTL + 4*i, h_mci_ctrl[i]);
693 }
694 if (g_mcg_cap & MCG_CTL_P)
695 rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl);
696 set_poll_bankmask(c);
697 if (!inited)
698 printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
699 smp_processor_id());
700 }
702 u64 mce_cap_init(void)
703 {
704 u32 l, h;
705 u64 value;
707 rdmsr(MSR_IA32_MCG_CAP, l, h);
708 value = ((u64)h << 32) | l;
709 /* For Guest vMCE usage */
710 g_mcg_cap = value & ~MCG_CMCI_P;
712 if (l & MCG_CTL_P) /* Control register present ? */
713 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
715 nr_mce_banks = l & MCG_CAP_COUNT;
716 if ( nr_mce_banks > MAX_NR_BANKS )
717 {
718 printk(KERN_WARNING "MCE: bank count exceeds MAX_NR_BANKS\n");
719 g_mcg_cap = (g_mcg_cap & ~MCG_CAP_COUNT) | MAX_NR_BANKS;
720 }
722 return value;
723 }
725 /* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
726 void mce_init_msr(struct domain *d)
727 {
728 d->arch.vmca_msrs.mcg_status = 0x0;
729 d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
730 d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
731 d->arch.vmca_msrs.nr_injection = 0;
732 memset(d->arch.vmca_msrs.mci_ctl, ~0,
733 sizeof(d->arch.vmca_msrs.mci_ctl));
734 INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
735 spin_lock_init(&d->arch.vmca_msrs.lock);
736 }
738 int mce_rdmsr(uint32_t msr, uint64_t *val)
739 {
740 struct domain *d = current->domain;
741 int ret = 1;
742 unsigned int bank;
743 struct bank_entry *entry = NULL;
745 *val = 0;
746 spin_lock(&d->arch.vmca_msrs.lock);
748 switch ( msr )
749 {
750 case MSR_IA32_MCG_STATUS:
751 *val = d->arch.vmca_msrs.mcg_status;
752 if (*val)
753 mce_printk(MCE_VERBOSE,
754 "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
755 break;
756 case MSR_IA32_MCG_CAP:
757 *val = d->arch.vmca_msrs.mcg_cap;
758 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
759 *val);
760 break;
761 case MSR_IA32_MCG_CTL:
762 /* Always 0 if no CTL support */
763 *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
764 mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
765 *val);
766 break;
767 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
768 bank = (msr - MSR_IA32_MC0_CTL) / 4;
769 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
770 {
771 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
772 ret = 0;
773 break;
774 }
775 switch (msr & (MSR_IA32_MC0_CTL | 3))
776 {
777 case MSR_IA32_MC0_CTL:
778 *val = d->arch.vmca_msrs.mci_ctl[bank] &
779 (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
780 mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
781 bank, *val);
782 break;
783 case MSR_IA32_MC0_STATUS:
784 /* Only error bank is read. Non-error banks simply return. */
785 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
786 {
787 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
788 struct bank_entry, list);
789 if (entry->bank == bank) {
790 *val = entry->mci_status;
791 mce_printk(MCE_VERBOSE,
792 "MCE: rd MC%u_STATUS in vMCE# context "
793 "value 0x%"PRIx64"\n", bank, *val);
794 }
795 else
796 entry = NULL;
797 }
798 break;
799 case MSR_IA32_MC0_ADDR:
800 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
801 {
802 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
803 struct bank_entry, list);
804 if ( entry->bank == bank )
805 {
806 *val = entry->mci_addr;
807 mce_printk(MCE_VERBOSE,
808 "MCE: rdmsr MC%u_ADDR in vMCE# context "
809 "0x%"PRIx64"\n", bank, *val);
810 }
811 }
812 break;
813 case MSR_IA32_MC0_MISC:
814 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
815 {
816 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
817 struct bank_entry, list);
818 if ( entry->bank == bank )
819 {
820 *val = entry->mci_misc;
821 mce_printk(MCE_VERBOSE,
822 "MCE: rd MC%u_MISC in vMCE# context "
823 "0x%"PRIx64"\n", bank, *val);
824 }
825 }
826 break;
827 }
828 break;
829 default:
830 switch ( boot_cpu_data.x86_vendor )
831 {
832 case X86_VENDOR_INTEL:
833 ret = intel_mce_rdmsr(msr, val);
834 break;
835 default:
836 ret = 0;
837 break;
838 }
839 break;
840 }
842 spin_unlock(&d->arch.vmca_msrs.lock);
843 return ret;
844 }
846 int mce_wrmsr(u32 msr, u64 val)
847 {
848 struct domain *d = current->domain;
849 struct bank_entry *entry = NULL;
850 unsigned int bank;
851 int ret = 1;
853 if ( !g_mcg_cap )
854 return 0;
856 spin_lock(&d->arch.vmca_msrs.lock);
858 switch ( msr )
859 {
860 case MSR_IA32_MCG_CTL:
861 d->arch.vmca_msrs.mcg_ctl = val;
862 break;
863 case MSR_IA32_MCG_STATUS:
864 d->arch.vmca_msrs.mcg_status = val;
865 mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
866 /* For HVM guest, this is the point for deleting vMCE injection node */
867 if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
868 {
869 d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
870 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
871 {
872 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
873 struct bank_entry, list);
874 if ( entry->mci_status & MCi_STATUS_VAL )
875 mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
876 "been cleared before write MCG_STATUS MSR\n");
878 mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
879 "Node, nr_injection %u\n",
880 d->arch.vmca_msrs.nr_injection);
881 list_del(&entry->list);
882 xfree(entry);
883 }
884 else
885 mce_printk(MCE_QUIET, "MCE: HVM guest's last injection"
886 " node not found, something is wrong!\n");
887 }
888 break;
889 case MSR_IA32_MCG_CAP:
890 mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
891 ret = -1;
892 break;
893 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
894 bank = (msr - MSR_IA32_MC0_CTL) / 4;
895 if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
896 {
897 mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
898 ret = 0;
899 break;
900 }
901 switch ( msr & (MSR_IA32_MC0_CTL | 3) )
902 {
903 case MSR_IA32_MC0_CTL:
904 d->arch.vmca_msrs.mci_ctl[bank] = val;
905 break;
906 case MSR_IA32_MC0_STATUS:
907 /* Take the first entry of the list; it corresponds to the current
908 * vMCE# injection. When the vMCE# has been processed by the
909 * guest, this node will be deleted.
910 * Only the error bank is written. Non-error banks simply return.
911 */
912 if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
913 {
914 entry = list_entry(d->arch.vmca_msrs.impact_header.next,
915 struct bank_entry, list);
916 if ( entry->bank == bank )
917 entry->mci_status = val;
918 mce_printk(MCE_VERBOSE,
919 "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
920 bank, val);
921 }
922 else
923 mce_printk(MCE_VERBOSE,
924 "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
925 break;
926 case MSR_IA32_MC0_ADDR:
927 mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
928 ret = -1;
929 break;
930 case MSR_IA32_MC0_MISC:
931 mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
932 ret = -1;
933 break;
934 }
935 break;
936 default:
937 switch ( boot_cpu_data.x86_vendor )
938 {
939 case X86_VENDOR_INTEL:
940 ret = intel_mce_wrmsr(msr, val);
941 break;
942 default:
943 ret = 0;
944 break;
945 }
946 break;
947 }
949 spin_unlock(&d->arch.vmca_msrs.lock);
950 return ret;
951 }
953 static void mcinfo_clear(struct mc_info *mi)
954 {
955 memset(mi, 0, sizeof(struct mc_info));
956 x86_mcinfo_nentries(mi) = 0;
957 }
959 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
960 {
961 int i;
962 unsigned long end1, end2;
963 struct mcinfo_common *mic, *mic_base, *mic_index;
965 mic = (struct mcinfo_common *)mcinfo;
966 mic_index = mic_base = x86_mcinfo_first(mi);
968 /* go to first free entry */
969 for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
970 mic_index = x86_mcinfo_next(mic_index);
971 }
973 /* check if there is enough space */
974 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
975 end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
977 if (end1 < end2)
978 return x86_mcerr("mcinfo_add: no more space", -ENOSPC);
980 /* there's enough space. add entry. */
981 memcpy(mic_index, mic, mic->size);
982 x86_mcinfo_nentries(mi)++;
984 return 0;
985 }
987 /* Dump machine check information in a format that
988 * mcelog can parse. This is used only when
989 * Dom0 does not take the notification. */
990 void x86_mcinfo_dump(struct mc_info *mi)
991 {
992 struct mcinfo_common *mic = NULL;
993 struct mcinfo_global *mc_global;
994 struct mcinfo_bank *mc_bank;
996 /* first print the global info */
997 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
998 if (mic == NULL)
999 return;
1000 mc_global = (struct mcinfo_global *)mic;
1001 if (mc_global->mc_flags & MC_FLAG_MCE) {
1002 printk(XENLOG_WARNING
1003 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
1004 mc_global->mc_coreid, mc_global->mc_gstatus);
1005 } else {
1006 printk(XENLOG_WARNING "MCE: The hardware reports that a "
1007 "non-fatal, correctable incident occurred on "
1008 "CPU %d.\n",
1009 mc_global->mc_coreid);
1010 }
1012 /* then the bank information */
1013 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
1014 do {
1015 if (mic == NULL)
1016 return;
1017 if (mic->type != MC_TYPE_BANK)
1018 goto next;
1020 mc_bank = (struct mcinfo_bank *)mic;
1022 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
1023 mc_bank->mc_bank,
1024 mc_bank->mc_status);
1025 if (mc_bank->mc_status & MCi_STATUS_MISCV)
1026 printk("[%16"PRIx64"]", mc_bank->mc_misc);
1027 if (mc_bank->mc_status & MCi_STATUS_ADDRV)
1028 printk(" at %16"PRIx64, mc_bank->mc_addr);
1030 printk("\n");
1031 next:
1032 mic = x86_mcinfo_next(mic); /* next entry */
1033 if ((mic == NULL) || (mic->size == 0))
1034 break;
1035 } while (1);
1036 }
1038 static void do_mc_get_cpu_info(void *v)
1039 {
1040 int cpu = smp_processor_id();
1041 int cindex, cpn;
1042 struct cpuinfo_x86 *c;
1043 xen_mc_logical_cpu_t *log_cpus, *xcp;
1044 uint32_t junk, ebx;
1046 log_cpus = v;
1047 c = &cpu_data[cpu];
1048 cindex = 0;
1049 cpn = cpu - 1;
1051 /*
1052 * Deal with sparse masks, condensed into a contig array.
1053 */
1054 while (cpn >= 0) {
1055 if (cpu_isset(cpn, cpu_online_map))
1056 cindex++;
1057 cpn--;
1058 }
1060 xcp = &log_cpus[cindex];
1061 c = &cpu_data[cpu];
1062 xcp->mc_cpunr = cpu;
1063 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
1064 &xcp->mc_coreid, &xcp->mc_threadid,
1065 &xcp->mc_apicid, &xcp->mc_ncores,
1066 &xcp->mc_ncores_active, &xcp->mc_nthreads);
1067 xcp->mc_cpuid_level = c->cpuid_level;
1068 xcp->mc_family = c->x86;
1069 xcp->mc_vendor = c->x86_vendor;
1070 xcp->mc_model = c->x86_model;
1071 xcp->mc_step = c->x86_mask;
1072 xcp->mc_cache_size = c->x86_cache_size;
1073 xcp->mc_cache_alignment = c->x86_cache_alignment;
1074 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1075 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1076 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1078 /*
1079 * This part needs to run on the CPU itself.
1080 */
1081 xcp->mc_nmsrvals = __MC_NMSRS;
1082 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1083 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1085 if (c->cpuid_level >= 1) {
1086 cpuid(1, &junk, &ebx, &junk, &junk);
1087 xcp->mc_clusterid = (ebx >> 24) & 0xff;
1088 } else
1089 xcp->mc_clusterid = hard_smp_processor_id();
1090 }
1093 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
1094 uint16_t *threadid, uint32_t *apicid,
1095 unsigned *ncores, unsigned *ncores_active,
1096 unsigned *nthreads)
1097 {
1098 struct cpuinfo_x86 *c;
1100 *apicid = cpu_physical_id(cpu);
1101 c = &cpu_data[cpu];
1102 if (c->apicid == BAD_APICID) {
1103 *chipid = cpu;
1104 *coreid = 0;
1105 *threadid = 0;
1106 if (ncores != NULL)
1107 *ncores = 1;
1108 if (ncores_active != NULL)
1109 *ncores_active = 1;
1110 if (nthreads != NULL)
1111 *nthreads = 1;
1112 } else {
1113 *chipid = phys_proc_id[cpu];
1114 if (c->x86_max_cores > 1)
1115 *coreid = cpu_core_id[cpu];
1116 else
1117 *coreid = 0;
1118 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1119 if (ncores != NULL)
1120 *ncores = c->x86_max_cores;
1121 if (ncores_active != NULL)
1122 *ncores_active = c->booted_cores;
1123 if (nthreads != NULL)
1124 *nthreads = c->x86_num_siblings;
1125 }
1126 }
1128 #define INTPOSE_NENT 50
1130 static struct intpose_ent {
1131 unsigned int cpu_nr;
1132 uint64_t msr;
1133 uint64_t val;
1134 } intpose_arr[INTPOSE_NENT];
1136 static void intpose_init(void)
1137 {
1138 static int done;
1139 int i;
1141 if (done++ > 0)
1142 return;
1144 for (i = 0; i < INTPOSE_NENT; i++) {
1145 intpose_arr[i].cpu_nr = -1;
1146 }
1147 }
1150 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1151 uint64_t *valp)
1152 {
1153 int i;
1155 for (i = 0; i < INTPOSE_NENT; i++) {
1156 if (intpose_arr[i].cpu_nr == cpu_nr &&
1157 intpose_arr[i].msr == msr) {
1158 if (valp != NULL)
1159 *valp = intpose_arr[i].val;
1160 return &intpose_arr[i];
1161 }
1162 }
1164 return NULL;
1165 }
1167 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1168 {
1169 struct intpose_ent *ent;
1170 int i;
1172 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1173 ent->val = val;
1174 return;
1175 }
1177 for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
1178 if (ent->cpu_nr == -1) {
1179 ent->cpu_nr = cpu_nr;
1180 ent->msr = msr;
1181 ent->val = val;
1182 return;
1183 }
1184 }
1186 printk("intpose_add: interpose array full - request dropped\n");
1187 }
1189 void intpose_inval(unsigned int cpu_nr, uint64_t msr)
1190 {
1191 struct intpose_ent *ent;
1193 if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
1194 ent->cpu_nr = -1;
1195 }
1196 }
1198 #define IS_MCA_BANKREG(r) \
1199 ((r) >= MSR_IA32_MC0_CTL && \
1200 (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
1201 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
1203 int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d)
1204 {
1205 int bank_nr;
1207 if ( !bank || !d || !h_mci_ctrl )
1208 return 1;
1210 /* Will an MCE happen in the host if the host mcg_ctl is 0? */
1211 if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
1212 return 1;
1214 bank_nr = bank->mc_bank;
1215 if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
1216 return 1;
1217 return 0;
1218 }
1220 static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1221 {
1222 struct cpuinfo_x86 *c;
1223 int i, errs = 0;
1225 c = &cpu_data[smp_processor_id()];
1227 for (i = 0; i < mci->mcinj_count; i++) {
1228 uint64_t reg = mci->mcinj_msr[i].reg;
1229 const char *reason = NULL;
1231 if (IS_MCA_BANKREG(reg)) {
1232 if (c->x86_vendor == X86_VENDOR_AMD) {
1233 /* On AMD we can set MCi_STATUS_WREN in the
1234 * HWCR MSR so that non-zero writes to bank
1235 * MSRs do not #GP. The injector in dom0
1236 * should set that bit, but we detect when it
1237 * is necessary and set it as a courtesy to
1238 * avoid #GP in the hypervisor. */
1239 mci->mcinj_flags |=
1240 _MC_MSRINJ_F_REQ_HWCR_WREN;
1241 continue;
1242 } else {
1243 /* No alternative but to interpose, so require
1244 * that the injector request interposition. */
1245 if (!(mci->mcinj_flags &
1246 MC_MSRINJ_F_INTERPOSE)) {
1247 reason = "must specify interposition";
1248 }
1249 }
1250 } else {
1251 switch (reg) {
1252 /* MSRs acceptable on all x86 cpus */
1253 case MSR_IA32_MCG_STATUS:
1254 break;
1256 /* MSRs that the HV will take care of */
1257 case MSR_K8_HWCR:
1258 if (c->x86_vendor == X86_VENDOR_AMD)
1259 reason = "HV will operate HWCR";
1260 else
1261 reason = "only supported on AMD";
1262 break;
1264 default:
1265 reason = "not a recognized MCA MSR";
1266 break;
1267 }
1268 }
1270 if (reason != NULL) {
1271 printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
1272 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1273 errs++;
1274 }
1275 }
1277 return !errs;
1278 }
1280 static uint64_t x86_mc_hwcr_wren(void)
1281 {
1282 uint64_t old;
1284 rdmsrl(MSR_K8_HWCR, old);
1286 if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
1287 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1288 wrmsrl(MSR_K8_HWCR, new);
1289 }
1291 return old;
1292 }
1294 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1295 {
1296 if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
1297 wrmsrl(MSR_K8_HWCR, hwcr);
1298 }
1300 static void x86_mc_msrinject(void *data)
1301 {
1302 struct xen_mc_msrinject *mci = data;
1303 struct mcinfo_msr *msr;
1304 struct cpuinfo_x86 *c;
1305 uint64_t hwcr = 0;
1306 int intpose;
1307 int i;
1309 c = &cpu_data[smp_processor_id()];
1311 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1312 hwcr = x86_mc_hwcr_wren();
1314 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1316 for (i = 0, msr = &mci->mcinj_msr[0];
1317 i < mci->mcinj_count; i++, msr++) {
1318 printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
1319 "<-- 0x%llx\n",
1320 intpose ? "interpose" : "hardware",
1321 mci->mcinj_cpunr, smp_processor_id(),
1322 (unsigned long long)msr->reg,
1323 (unsigned long long)msr->value);
1325 if (intpose)
1326 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1327 else
1328 wrmsrl(msr->reg, msr->value);
1329 }
1331 if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
1332 x86_mc_hwcr_wren_restore(hwcr);
1333 }
1335 /*ARGSUSED*/
1336 static void x86_mc_mceinject(void *data)
1337 {
1338 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1339 __asm__ __volatile__("int $0x12");
1340 }
1342 #if BITS_PER_LONG == 64
1344 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1345 #define COOKIE2ID(c) ((uint64_t)(c))
1347 #elif BITS_PER_LONG == 32
1349 #define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
1350 #define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
1352 #elif defined(BITS_PER_LONG)
1353 #error BITS_PER_LONG has unexpected value
1354 #else
1355 #error BITS_PER_LONG definition absent
1356 #endif
1358 #ifdef CONFIG_COMPAT
1359 # include <compat/arch-x86/xen-mca.h>
1361 # define xen_mcinfo_msr mcinfo_msr
1362 CHECK_mcinfo_msr;
1363 # undef xen_mcinfo_msr
1364 # undef CHECK_mcinfo_msr
1365 # define CHECK_mcinfo_msr struct mcinfo_msr
1367 # define xen_mcinfo_common mcinfo_common
1368 CHECK_mcinfo_common;
1369 # undef xen_mcinfo_common
1370 # undef CHECK_mcinfo_common
1371 # define CHECK_mcinfo_common struct mcinfo_common
1373 CHECK_FIELD_(struct, mc_fetch, flags);
1374 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1375 # define CHECK_compat_mc_fetch struct mc_fetch
1377 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1378 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1380 CHECK_mc;
1381 # undef CHECK_compat_mc_fetch
1382 # undef CHECK_compat_mc_physcpuinfo
1384 # define xen_mc_info mc_info
1385 CHECK_mc_info;
1386 # undef xen_mc_info
1388 # define xen_mcinfo_global mcinfo_global
1389 CHECK_mcinfo_global;
1390 # undef xen_mcinfo_global
1392 # define xen_mcinfo_bank mcinfo_bank
1393 CHECK_mcinfo_bank;
1394 # undef xen_mcinfo_bank
1396 # define xen_mcinfo_extended mcinfo_extended
1397 CHECK_mcinfo_extended;
1398 # undef xen_mcinfo_extended
1400 # define xen_mcinfo_recovery mcinfo_recovery
1401 # define xen_cpu_offline_action cpu_offline_action
1402 # define xen_page_offline_action page_offline_action
1403 CHECK_mcinfo_recovery;
1404 # undef xen_cpu_offline_action
1405 # undef xen_page_offline_action
1406 # undef xen_mcinfo_recovery
1407 #else
1408 # define compat_mc_fetch xen_mc_fetch
1409 # define compat_mc_physcpuinfo xen_mc_physcpuinfo
1410 # define compat_handle_is_null guest_handle_is_null
1411 # define copy_to_compat copy_to_guest
1412 #endif
1414 /* Machine Check Architecture Hypercall */
1415 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
1416 {
1417 long ret = 0;
1418 struct xen_mc curop, *op = &curop;
1419 struct vcpu *v = current;
1420 union {
1421 struct xen_mc_fetch *nat;
1422 struct compat_mc_fetch *cmp;
1423 } mc_fetch;
1424 union {
1425 struct xen_mc_physcpuinfo *nat;
1426 struct compat_mc_physcpuinfo *cmp;
1427 } mc_physcpuinfo;
1428 uint32_t flags, cmdflags;
1429 int nlcpu;
1430 xen_mc_logical_cpu_t *log_cpus = NULL;
1431 mctelem_cookie_t mctc;
1432 mctelem_class_t which;
1433 unsigned int target;
1434 struct xen_mc_msrinject *mc_msrinject;
1435 struct xen_mc_mceinject *mc_mceinject;
1437 if (!IS_PRIV(v->domain) )
1438 return x86_mcerr(NULL, -EPERM);
1440 if ( copy_from_guest(op, u_xen_mc, 1) )
1441 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1443 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1444 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1446 switch (op->cmd) {
1447 case XEN_MC_fetch:
1448 mc_fetch.nat = &op->u.mc_fetch;
1449 cmdflags = mc_fetch.nat->flags;
1451 switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
1452 case XEN_MC_NONURGENT:
1453 which = MC_NONURGENT;
1454 break;
1456 case XEN_MC_URGENT:
1457 which = MC_URGENT;
1458 break;
1460 default:
1461 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1462 }
1464 flags = XEN_MC_OK;
1466 if (cmdflags & XEN_MC_ACK) {
1467 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1468 mctelem_ack(which, cookie);
1469 } else {
1470 if (!is_pv_32on64_vcpu(v)
1471 ? guest_handle_is_null(mc_fetch.nat->data)
1472 : compat_handle_is_null(mc_fetch.cmp->data))
1473 return x86_mcerr("do_mca fetch: guest buffer "
1474 "invalid", -EINVAL);
1476 if ((mctc = mctelem_consume_oldest_begin(which))) {
1477 struct mc_info *mcip = mctelem_dataptr(mctc);
1478 if (!is_pv_32on64_vcpu(v)
1479 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1480 : copy_to_compat(mc_fetch.cmp->data,
1481 mcip, 1)) {
1482 ret = -EFAULT;
1483 flags |= XEN_MC_FETCHFAILED;
1484 mc_fetch.nat->fetch_id = 0;
1485 } else {
1486 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1487 }
1488 mctelem_consume_oldest_end(mctc);
1489 } else {
1490 /* There is no data */
1491 flags |= XEN_MC_NODATA;
1492 mc_fetch.nat->fetch_id = 0;
1493 }
1495 mc_fetch.nat->flags = flags;
1496 if (copy_to_guest(u_xen_mc, op, 1) != 0)
1497 ret = -EFAULT;
1498 }
1500 break;
1502 case XEN_MC_notifydomain:
1503 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1505 case XEN_MC_physcpuinfo:
1506 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1507 nlcpu = num_online_cpus();
1509 if (!is_pv_32on64_vcpu(v)
1510 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1511 : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
1512 if (mc_physcpuinfo.nat->ncpus <= 0)
1513 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1514 -EINVAL);
1515 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1516 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1517 if (log_cpus == NULL)
1518 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1520 if (on_each_cpu(do_mc_get_cpu_info, log_cpus, 1)) {
1521 xfree(log_cpus);
1522 return x86_mcerr("do_mca cpuinfo", -EIO);
1523 }
1524 if (!is_pv_32on64_vcpu(v)
1525 ? copy_to_guest(mc_physcpuinfo.nat->info,
1526 log_cpus, nlcpu)
1527 : copy_to_compat(mc_physcpuinfo.cmp->info,
1528 log_cpus, nlcpu))
1529 ret = -EFAULT;
1530 xfree(log_cpus);
1531 }
1533 mc_physcpuinfo.nat->ncpus = nlcpu;
1535 if (copy_to_guest(u_xen_mc, op, 1))
1536 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1538 break;
1540 case XEN_MC_msrinject:
1541 if (nr_mce_banks == 0)
1542 return x86_mcerr("do_mca inject", -ENODEV);
1544 mc_msrinject = &op->u.mc_msrinject;
1545 target = mc_msrinject->mcinj_cpunr;
1547 if (target >= NR_CPUS)
1548 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1550 if (!cpu_isset(target, cpu_online_map))
1551 return x86_mcerr("do_mca inject: target offline",
1552 -EINVAL);
1554 if (mc_msrinject->mcinj_count == 0)
1555 return 0;
1557 if (!x86_mc_msrinject_verify(mc_msrinject))
1558 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1560 add_taint(TAINT_ERROR_INJECT);
1562 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1563 mc_msrinject, 1);
1565 break;
1567 case XEN_MC_mceinject:
1568 if (nr_mce_banks == 0)
1569 return x86_mcerr("do_mca #MC", -ENODEV);
1571 mc_mceinject = &op->u.mc_mceinject;
1572 target = mc_mceinject->mceinj_cpunr;
1574 if (target >= NR_CPUS)
1575 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1577 if (!cpu_isset(target, cpu_online_map))
1578 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1580 add_taint(TAINT_ERROR_INJECT);
1582 if ( mce_broadcast )
1583 on_each_cpu(x86_mc_mceinject, mc_mceinject, 0);
1584 else
1585 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1586 mc_mceinject, 1);
1587 break;
1589 default:
1590 return x86_mcerr("do_mca: bad command", -EINVAL);
1591 }
1593 return ret;
1594 }
1595 void set_poll_bankmask(struct cpuinfo_x86 *c)
1596 {
1598 if (cmci_support && !mce_disabled) {
1599 memcpy(&(__get_cpu_var(poll_bankmask)),
1600 &(__get_cpu_var(no_cmci_banks)), sizeof(cpu_banks_t));
1601 }
1602 else {
1603 memcpy(&(get_cpu_var(poll_bankmask)), &mca_allbanks, sizeof(cpu_banks_t));
1604 if (mce_firstbank(c))
1605 clear_bit(0, get_cpu_var(poll_bankmask));
1606 }
1607 }
1608 void mc_panic(char *s)
1609 {
1610 is_mc_panic = 1;
1611 console_force_unlock();
1612 printk("Fatal machine check: %s\n", s);
1613 printk("\n"
1614 "****************************************\n"
1615 "\n"
1616 " The processor has reported a hardware error which cannot\n"
1617 " be recovered from. Xen will now reboot the machine.\n");
1618 panic("HARDWARE ERROR");