/*
 * mce.c - x86 Machine Check Exception Reporting
 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/param.h>
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/console.h>
#include <xen/sched.h>
#include <xen/cpumask.h>
#include <xen/event.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h> /* for do_mca */
#include <xen/cpu.h>

#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/p2m.h>

#include "mce.h"
#include "barrier.h"
#include "mcaction.h"
#include "util.h"
#include "vmce.h"

bool __read_mostly opt_mce = true;
boolean_param("mce", opt_mce);
bool __read_mostly mce_broadcast;
bool is_mc_panic;
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, nr_mce_banks);
unsigned int __read_mostly firstbank;
unsigned int __read_mostly ppin_msr;
uint8_t __read_mostly cmci_apic_vector;

DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);

static void intpose_init(void);
static void mcinfo_clear(struct mc_info *);
struct mca_banks *mca_allbanks;

#define SEG_PL(segsel) ((segsel) & 0x3)
#define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)

#if 0
#define x86_mcerr(fmt, err, args...)                                    \
    ({                                                                  \
        int _err = (err);                                               \
        gdprintk(XENLOG_WARNING, "x86_mcerr: " fmt ", returning %d\n", \
                 ## args, _err);                                        \
        _err;                                                           \
    })
#else
#define x86_mcerr(fmt, err, args...) (err)
#endif
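/*
 * x86_mcerr() wraps an error value being returned; the debug variant above
 * (compiled out by default) additionally logs the message and the value.
 */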

int mce_verbosity;
static int __init mce_set_verbosity(const char *str)
{
    if ( strcmp("verbose", str) == 0 )
        mce_verbosity = MCE_VERBOSE;
    else
        return -EINVAL;

    return 0;
}
custom_param("mce_verbosity", mce_set_verbosity);

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(const struct cpu_user_regs *regs)
{
    console_force_unlock();
    printk("Unexpected Machine Check Exception\n");
    fatal_trap(regs, 1);
}

static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;

void x86_mce_vector_register(x86_mce_vector_t hdlr)
{
    _machine_check_vector = hdlr;
}

/* Call the installed machine check handler for this CPU setup. */

void do_machine_check(const struct cpu_user_regs *regs)
{
    mce_enter();
    _machine_check_vector(regs);
    mce_exit();
}

/*
 * Init machine check callback handler
 * It is used to collect additional information provided by newer
 * CPU families/models without the need to duplicate the whole handler.
 * This avoids having many handlers doing almost the same thing, each
 * with its own tweaks and bugs.
 */
static x86_mce_callback_t mc_callback_bank_extended = NULL;

void x86_mce_callback_register(x86_mce_callback_t cbfunc)
{
    mc_callback_bank_extended = cbfunc;
}

/*
 * Machine check recoverable judgement callback handler
 * It is used to judge whether a UC error is recoverable by software
 */
static mce_recoverable_t mc_recoverable_scan = NULL;

void mce_recoverable_register(mce_recoverable_t cbfunc)
{
    mc_recoverable_scan = cbfunc;
}

struct mca_banks *mcabanks_alloc(unsigned int nr_mce_banks)
{
    struct mca_banks *mb;

    mb = xmalloc(struct mca_banks);
    if ( !mb )
        return NULL;

    /*
     * For APs, allocations are done by the BSP, i.e. when the bank count
     * may not be known yet. A zero bank count is a clear indication of this.
     */
    if ( !nr_mce_banks )
        nr_mce_banks = MCG_CAP_COUNT;

    mb->bank_map = xzalloc_array(unsigned long,
                                 BITS_TO_LONGS(nr_mce_banks));
    if ( !mb->bank_map )
    {
        xfree(mb);
        return NULL;
    }

    mb->num = nr_mce_banks;

    return mb;
}

void mcabanks_free(struct mca_banks *banks)
{
    if ( banks == NULL )
        return;
    if ( banks->bank_map )
        xfree(banks->bank_map);
    xfree(banks);
}

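/*
 * Clear one MCA bank's telemetry: zero ADDR/MISC when STATUS flags them
 * as valid, then zero STATUS itself.
 */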
static void mcabank_clear(int banknum)
{
    uint64_t status;

    status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum));

    if ( status & MCi_STATUS_ADDRV )
        mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL);
    if ( status & MCi_STATUS_MISCV )
        mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL);

    mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL);
}

/*
 * Callback handler for judging whether a Machine Check error bank needs
 * to be cleared. According to the latest Intel MCA OS Recovery Writer's
 * Guide, whether the error MCA bank needs to be cleared is decided by the
 * mca_source and the MCi_STATUS bit values.
 */
static mce_need_clearbank_t mc_need_clearbank_scan = NULL;

void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
{
    mc_need_clearbank_scan = cbfunc;
}

/*
 * mce_logout_lock should only be used in the trap handler,
 * while MCIP has not been cleared yet in the global status
 * register. Other use is not safe, since an MCE trap can
 * happen at any moment, which would cause lock recursion.
 */
static DEFINE_SPINLOCK(mce_logout_lock);

const struct mca_error_handler *__read_mostly mce_dhandlers;
const struct mca_error_handler *__read_mostly mce_uhandlers;
unsigned int __read_mostly mce_dhandler_num;
unsigned int __read_mostly mce_uhandler_num;

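/*
 * Append a MC_TYPE_BANK record for @bank to @mi: capture STATUS and, when
 * valid, MISC/ADDR; for polled/CMCI contexts also try to attribute the
 * address to the owning domain, and for CMCI additionally record CTL2/TSC.
 */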
static void mca_init_bank(enum mca_source who, struct mc_info *mi, int bank)
{
    struct mcinfo_bank *mib;

    if ( !mi )
        return;

    mib = x86_mcinfo_reserve(mi, sizeof(*mib), MC_TYPE_BANK);
    if ( !mib )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return;
    }

    mib->mc_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank));

    mib->mc_bank = bank;
    mib->mc_domid = DOMID_INVALID;

    if ( mib->mc_status & MCi_STATUS_MISCV )
        mib->mc_misc = mca_rdmsr(MSR_IA32_MCx_MISC(bank));

    if ( mib->mc_status & MCi_STATUS_ADDRV )
        mib->mc_addr = mca_rdmsr(MSR_IA32_MCx_ADDR(bank));

    if ( (mib->mc_status & MCi_STATUS_MISCV) &&
         (mib->mc_status & MCi_STATUS_ADDRV) &&
         (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
         (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
         (mfn_valid(_mfn(paddr_to_pfn(mib->mc_addr)))) )
    {
        struct domain *d;

        d = maddr_get_owner(mib->mc_addr);
        if ( d )
            mib->mc_domid = d->domain_id;
    }

    if ( who == MCA_CMCI_HANDLER )
    {
        mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank);
        mib->mc_tsc = rdtsc();
    }
}

static int mca_init_global(uint32_t flags, struct mcinfo_global *mig)
{
    uint64_t status;
    int cpu_nr;
    const struct vcpu *curr = current;

    /* Set global information */
    status = mca_rdmsr(MSR_IA32_MCG_STATUS);
    mig->mc_gstatus = status;
    mig->mc_domid = DOMID_INVALID;
    mig->mc_vcpuid = XEN_MC_VCPUID_INVALID;
    mig->mc_flags = flags;
    cpu_nr = smp_processor_id();
    /* Retrieve detector information */
    x86_mc_get_cpu_info(cpu_nr, &mig->mc_socketid,
                        &mig->mc_coreid, &mig->mc_core_threadid,
                        &mig->mc_apicid, NULL, NULL, NULL);

    if ( curr )
    {
        mig->mc_domid = curr->domain->domain_id;
        mig->mc_vcpuid = curr->vcpu_id;
    }

    return 0;
}

/*
 * Utility function to perform MCA bank telemetry readout and to push that
 * telemetry towards an interested dom0 for logging and diagnosis.
 * The caller - #MC handler or MCA poll function - must arrange that we
 * do not migrate cpus.
 */

/* XXFM Could add overflow counting? */

/*
 * The out_param clear_bank is for the Machine Check handler caller.
 * On the latest Intel CPUs, whether to clear the error bank status needs
 * to be decided by the callback function registered above.
 */
mctelem_cookie_t
mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
                  struct mca_summary *sp, struct mca_banks *clear_bank)
{
    uint64_t gstatus, status;
    struct mcinfo_global *mig = NULL; /* on stack */
    mctelem_cookie_t mctc = NULL;
    bool uc = false, pcc = false, recover = true, need_clear = true;
    uint32_t mc_flags = 0;
    struct mc_info *mci = NULL;
    mctelem_class_t which = MC_URGENT; /* XXXgcc */
    int errcnt = 0;
    int i;

    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    switch ( who )
    {
    case MCA_MCE_SCAN:
        mc_flags = MC_FLAG_MCE;
        which = MC_URGENT;
        break;

    case MCA_POLLER:
    case MCA_RESET:
        mc_flags = MC_FLAG_POLLED;
        which = MC_NONURGENT;
        break;

    case MCA_CMCI_HANDLER:
        mc_flags = MC_FLAG_CMCI;
        which = MC_NONURGENT;
        break;

    default:
        BUG();
    }

    /*
     * If no mc_recoverable_scan callback handler is registered,
     * this error is not recoverable.
     */
    recover = mc_recoverable_scan ? 1 : 0;

    for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
    {
        /* Skip bank if corresponding bit in bankmask is clear */
        if ( !mcabanks_test(i, bankmask) )
            continue;

        status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
        if ( !(status & MCi_STATUS_VAL) )
            continue; /* this bank has no valid telemetry */

        /*
         * For the latest Intel CPUs, the CMCI/MCE handler caller needs to
         * decide whether to clear the bank based on MCi_STATUS bits such
         * as OVER/UC/EN/PCC/S/AR.
         */
        if ( mc_need_clearbank_scan )
            need_clear = mc_need_clearbank_scan(who, status);

        /*
         * If this is the first bank with valid MCA DATA, then
         * try to reserve an entry from the urgent/nonurgent queue
         * depending on whether we are called from an exception or
         * a poller; this can fail (for example dom0 may not
         * yet have consumed past telemetry).
         */
        if ( errcnt++ == 0 )
        {
            mctc = mctelem_reserve(which);
            if ( mctc )
            {
                mci = mctelem_dataptr(mctc);
                mcinfo_clear(mci);
                mig = x86_mcinfo_reserve(mci, sizeof(*mig), MC_TYPE_GLOBAL);
                /* mc_info should at least hold the global information */
                ASSERT(mig);
                mca_init_global(mc_flags, mig);
                /* A hook here to get global extended msrs */
                if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
                    intel_get_extended_msrs(mig, mci);
            }
        }

        /* flag for uncorrected errors */
        if ( !uc && ((status & MCi_STATUS_UC) != 0) )
            uc = true;

        /* flag processor context corrupt */
        if ( !pcc && ((status & MCi_STATUS_PCC) != 0) )
            pcc = true;

        if ( recover && uc )
            /* uc = true, recover = true, we need not panic. */
            recover = mc_recoverable_scan(status);

        mca_init_bank(who, mci, i);

        if ( mc_callback_bank_extended )
            mc_callback_bank_extended(mci, i, status);

        /* By default, need_clear = true */
        if ( who != MCA_MCE_SCAN && need_clear )
            /* Clear bank */
            mcabank_clear(i);
        else if ( who == MCA_MCE_SCAN && need_clear )
            mcabanks_set(i, clear_bank);
    }

    if ( mig && errcnt > 0 )
    {
        if ( pcc )
            mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
        else if ( uc )
            mig->mc_flags |= MC_FLAG_RECOVERABLE;
        else
            mig->mc_flags |= MC_FLAG_CORRECTABLE;
    }

    if ( sp )
    {
        sp->errcnt = errcnt;
        sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
        sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
        sp->lmce = (gstatus & MCG_STATUS_LMCE) != 0;
        sp->uc = uc;
        sp->pcc = pcc;
        sp->recoverable = recover;
    }

    return mci != NULL ? mctc : NULL; /* may be NULL */
}

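/*
 * Spin with trylock so that a waiter keeps calling mce_panic_check()
 * between attempts and can still notice a machine-check panic declared
 * by another CPU, instead of spinning blindly on the lock.
 */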
static void mce_spin_lock(spinlock_t *lk)
{
    while ( !spin_trylock(lk) )
    {
        cpu_relax();
        mce_panic_check();
    }
}

static void mce_spin_unlock(spinlock_t *lk)
{
    spin_unlock(lk);
}

static enum mce_result mce_action(const struct cpu_user_regs *regs,
                                  mctelem_cookie_t mctc);

/*
 * Return:
 * -1: if system can't be recovered
 * 0: Continue to next step
 */
static int mce_urgent_action(const struct cpu_user_regs *regs,
                             mctelem_cookie_t mctc)
{
    uint64_t gstatus;

    if ( mctc == NULL )
        return 0;

    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);

    /*
     * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an
     * asynchronous error, and currently we have no way to precisely
     * determine whether the error occurred in the guest or in the
     * hypervisor. To avoid handling the error in the wrong way, we treat
     * it as unrecoverable.
     *
     * Another unrecoverable case is RIPV = 0 while in the hypervisor,
     * since Xen is not preemptible.
     */
    if ( !(gstatus & MCG_STATUS_RIPV) &&
         (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
        return -1;

    return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
}

/* Shared #MC handler. */
void mcheck_cmn_handler(const struct cpu_user_regs *regs)
{
    static DEFINE_MCE_BARRIER(mce_trap_bar);
    static atomic_t severity_cpu = ATOMIC_INIT(-1);
    static atomic_t found_error = ATOMIC_INIT(0);
    static cpumask_t mce_fatal_cpus;
    struct mca_banks *bankmask = mca_allbanks;
    unsigned int cpu = smp_processor_id();
    struct mca_banks *clear_bank = per_cpu(mce_clear_banks, cpu);
    uint64_t gstatus;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;
    bool bcast, lmce;

    mce_spin_lock(&mce_logout_lock);

    if ( clear_bank != NULL )
        memset(clear_bank->bank_map, 0x0,
               sizeof(long) * BITS_TO_LONGS(clear_bank->num));
    mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
    lmce = bs.lmce;
    bcast = mce_broadcast && !lmce;

    if ( bs.errcnt )
    {
        /*
         * Uncorrected errors must be dealt with in softirq context.
         */
        if ( bs.uc || bs.pcc )
        {
            add_taint(TAINT_MACHINE_CHECK);
            if ( mctc )
                mctelem_defer(mctc, lmce);
            /*
             * If PCC = 1 or the error cannot be recovered, context is
             * lost, so reboot now without clearing the banks, and deal
             * with the telemetry after reboot (the MSRs are sticky).
             */
            if ( bs.pcc || !bs.recoverable )
                cpumask_set_cpu(cpu, &mce_fatal_cpus);
        }
        else if ( mctc != NULL )
            mctelem_commit(mctc);
        atomic_set(&found_error, 1);

        /* The last CPU will take care of the check/clean-up etc. */
        atomic_set(&severity_cpu, cpu);

        mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%u\n",
                   *((unsigned long *)clear_bank), cpu);
        if ( clear_bank != NULL )
            mcheck_mca_clearbanks(clear_bank);
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
    mce_spin_unlock(&mce_logout_lock);

    mce_barrier_enter(&mce_trap_bar, bcast);
    if ( mctc != NULL && mce_urgent_action(regs, mctc) )
        cpumask_set_cpu(cpu, &mce_fatal_cpus);
    mce_barrier_exit(&mce_trap_bar, bcast);

    /*
     * Wait until everybody has processed the trap.
     */
    mce_barrier_enter(&mce_trap_bar, bcast);
    if ( lmce || atomic_read(&severity_cpu) == cpu )
    {
        /*
         * According to the SDM, if no error bank is found on any CPU,
         * something unexpected is happening and we can do no recovery
         * work other than resetting the system.
         */
        if ( atomic_read(&found_error) == 0 )
            mc_panic("MCE: No CPU found valid MCE, need reset");
        if ( !cpumask_empty(&mce_fatal_cpus) )
        {
            char ebuf[96];

            snprintf(ebuf, sizeof(ebuf),
                     "MCE: Fatal error happened on CPUs %*pb",
                     CPUMASK_PR(&mce_fatal_cpus));

            mc_panic(ebuf);
        }
        atomic_set(&found_error, 0);
        atomic_set(&severity_cpu, -1);
    }
    mce_barrier_exit(&mce_trap_bar, bcast);

    /* Clear flags after above fatal check */
    mce_barrier_enter(&mce_trap_bar, bcast);
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    if ( (gstatus & MCG_STATUS_MCIP) != 0 )
    {
        mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
        mca_wrmsr(MSR_IA32_MCG_STATUS, 0);
    }
    mce_barrier_exit(&mce_trap_bar, bcast);

    raise_softirq(MACHINE_CHECK_SOFTIRQ);
}

void mcheck_mca_clearbanks(struct mca_banks *bankmask)
{
    int i;

    for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
    {
        if ( !mcabanks_test(i, bankmask) )
            continue;
        mcabank_clear(i);
    }
}

/* Check for the existence of Machine Check */
bool mce_available(const struct cpuinfo_x86 *c)
{
    return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/*
 * Check if bank 0 is usable for MCE. It isn't for Intel P6 family
 * before model 0x1a.
 */
unsigned int mce_firstbank(struct cpuinfo_x86 *c)
{
    return c->x86 == 6 &&
           c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a;
}

int show_mca_info(int inited, struct cpuinfo_x86 *c)
{
    static enum mcheck_type g_type = mcheck_unset;

    if ( inited != g_type )
    {
        char prefix[20];
        static const char *const type_str[] = {
            [mcheck_amd_famXX] = "AMD",
            [mcheck_amd_k8] = "AMD K8",
            [mcheck_intel] = "Intel",
            [mcheck_hygon] = "Hygon"
        };

        snprintf(prefix, ARRAY_SIZE(prefix), "%sCPU%u: ",
                 g_type != mcheck_unset ? XENLOG_WARNING : XENLOG_INFO,
                 smp_processor_id());
        BUG_ON(inited >= ARRAY_SIZE(type_str));
        switch ( inited )
        {
        default:
            printk("%s%s machine check reporting enabled\n",
                   prefix, type_str[inited]);
            break;

        case mcheck_amd_famXX:
        case mcheck_hygon:
            printk("%s%s Fam%xh machine check reporting enabled\n",
                   prefix, type_str[inited], c->x86);
            break;

        case mcheck_none:
            printk("%sNo machine check initialization\n", prefix);
            break;
        }
        g_type = inited;
    }

    return 0;
}

static void set_poll_bankmask(struct cpuinfo_x86 *c)
{
    int cpu = smp_processor_id();
    struct mca_banks *mb;

    mb = per_cpu(poll_bankmask, cpu);
    BUG_ON(!mb);

    if ( cmci_support && opt_mce )
    {
        const struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);

        if ( unlikely(cmci->num < mb->num) )
            bitmap_fill(mb->bank_map, mb->num);
        bitmap_copy(mb->bank_map, cmci->bank_map, min(mb->num, cmci->num));
    }
    else
    {
        bitmap_copy(mb->bank_map, mca_allbanks->bank_map,
                    per_cpu(nr_mce_banks, cpu));
        if ( mce_firstbank(c) )
            mcabanks_clear(0, mb);
    }
}

/* The per-bank ctl/status init is platform specific because of AMD's quirks */
static int mca_cap_init(void)
{
    uint64_t msr_content;
    unsigned int nr, cpu = smp_processor_id();

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( msr_content & MCG_CTL_P ) /* Control register present ? */
        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);

    per_cpu(nr_mce_banks, cpu) = nr = MASK_EXTR(msr_content, MCG_CAP_COUNT);

    if ( !nr )
    {
        printk(XENLOG_INFO
               "CPU%u: No MCE banks present. Machine check support disabled\n",
               cpu);
        return -ENODEV;
    }

    /* mcabanks_alloc depends on nr_mce_banks */
    if ( !mca_allbanks || nr > mca_allbanks->num )
    {
        unsigned int i;
        struct mca_banks *all = mcabanks_alloc(nr);

        if ( !all )
            return -ENOMEM;
        for ( i = 0; i < nr; i++ )
            mcabanks_set(i, all);
        mcabanks_free(xchg(&mca_allbanks, all));
    }

    return 0;
}

static void cpu_bank_free(unsigned int cpu)
{
    struct mca_banks *poll = per_cpu(poll_bankmask, cpu);
    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu);

    mcabanks_free(poll);
    mcabanks_free(clr);

    per_cpu(poll_bankmask, cpu) = NULL;
    per_cpu(mce_clear_banks, cpu) = NULL;
}

static int cpu_bank_alloc(unsigned int cpu)
{
    unsigned int nr = per_cpu(nr_mce_banks, cpu);
    struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(nr);
    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(nr);

    if ( !poll || !clr )
    {
        mcabanks_free(poll);
        mcabanks_free(clr);
        return -ENOMEM;
    }

    per_cpu(poll_bankmask, cpu) = poll;
    per_cpu(mce_clear_banks, cpu) = clr;
    return 0;
}

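/*
 * CPU hotplug notifier: allocate the per-CPU bank bitmaps when a CPU is
 * being prepared, and free them again when bring-up is cancelled or the
 * CPU dies (or, if offline CPUs are parked, only once it is finally
 * removed).
 */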
static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_bank_alloc(cpu);
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        if ( !park_offline_cpus )
            cpu_bank_free(cpu);
        break;

    case CPU_REMOVE:
        if ( park_offline_cpus )
            cpu_bank_free(cpu);
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c, bool bsp)
{
    enum mcheck_type inited = mcheck_none;
    unsigned int cpu = smp_processor_id();

    if ( !opt_mce )
    {
        if ( bsp )
            printk(XENLOG_INFO "MCE support disabled by bootparam\n");
        return;
    }

    if ( !mce_available(c) )
    {
        printk(XENLOG_INFO "CPU%i: No machine check support available\n", cpu);
        return;
    }

    /* Hardware Enable */
    if ( mca_cap_init() )
        return;

    if ( !bsp )
    {
        per_cpu(poll_bankmask, cpu)->num = per_cpu(nr_mce_banks, cpu);
        per_cpu(mce_clear_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
    }
    else if ( cpu_bank_alloc(cpu) )
        panic("Insufficient memory for MCE bank allocations\n");

    switch ( c->x86_vendor )
    {
    case X86_VENDOR_AMD:
    case X86_VENDOR_HYGON:
        inited = amd_mcheck_init(c);
        break;

    case X86_VENDOR_INTEL:
        switch ( c->x86 )
        {
        case 6:
        case 15:
            inited = intel_mcheck_init(c, bsp);
            break;
        }
        break;

    default:
        break;
    }

    show_mca_info(inited, c);
    if ( inited == mcheck_none || inited == mcheck_unset )
        goto out;

    intpose_init();

    if ( bsp )
    {
        mctelem_init(sizeof(struct mc_info));
        register_cpu_notifier(&cpu_nfb);
    }

    /* Turn on MCE now */
    set_in_cr4(X86_CR4_MCE);

    set_poll_bankmask(c);

    return;
 out:
    if ( bsp )
    {
        cpu_bank_free(smp_processor_id());
        mcabanks_free(mca_allbanks);
        mca_allbanks = NULL;
    }
}

static void mcinfo_clear(struct mc_info *mi)
{
    memset(mi, 0, sizeof(struct mc_info));
    x86_mcinfo_nentries(mi) = 0;
}

void *x86_mcinfo_reserve(struct mc_info *mi,
                         unsigned int size, unsigned int type)
{
    int i;
    unsigned long end1, end2;
    struct mcinfo_common *mic_base, *mic_index;

    mic_index = mic_base = x86_mcinfo_first(mi);

    /* go to first free entry */
    for ( i = 0; i < x86_mcinfo_nentries(mi); i++ )
        mic_index = x86_mcinfo_next(mic_index);

    /* check if there is enough space */
    end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
    end2 = (unsigned long)((uint8_t *)mic_index + size);

    if ( end1 < end2 )
    {
        mce_printk(MCE_CRITICAL,
                   "mcinfo_add: No space left in mc_info\n");
        return NULL;
    }

    /* there's enough space. add entry. */
    x86_mcinfo_nentries(mi)++;

    memset(mic_index, 0, size);
    mic_index->size = size;
    mic_index->type = type;

    return mic_index;
}

static void x86_mcinfo_apei_save(
    struct mcinfo_global *mc_global, struct mcinfo_bank *mc_bank)
{
    struct mce m;

    memset(&m, 0, sizeof(struct mce));

    m.cpu = mc_global->mc_coreid;
    m.cpuvendor = boot_cpu_data.x86_vendor;
    m.cpuid = cpuid_eax(1);
    m.socketid = mc_global->mc_socketid;
    m.apicid = mc_global->mc_apicid;

    m.mcgstatus = mc_global->mc_gstatus;
    m.status = mc_bank->mc_status;
    m.misc = mc_bank->mc_misc;
    m.addr = mc_bank->mc_addr;
    m.bank = mc_bank->mc_bank;

    apei_write_mce(&m);
}

/*
 * Dump machine check information in a format that mcelog can parse.
 * This is used only when Dom0 does not take the notification.
 */
void x86_mcinfo_dump(struct mc_info *mi)
{
    struct mcinfo_common *mic = NULL;
    struct mcinfo_global *mc_global;
    struct mcinfo_bank *mc_bank;

    /* first print the global info */
    x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
    if ( mic == NULL )
        return;
    mc_global = (struct mcinfo_global *)mic;
    if ( mc_global->mc_flags & MC_FLAG_MCE )
        printk(XENLOG_WARNING
               "CPU%d: Machine Check Exception: %16"PRIx64"\n",
               mc_global->mc_coreid, mc_global->mc_gstatus);
    else if ( mc_global->mc_flags & MC_FLAG_CMCI )
        printk(XENLOG_WARNING "CMCI occurred on CPU %d.\n",
               mc_global->mc_coreid);
    else if ( mc_global->mc_flags & MC_FLAG_POLLED )
        printk(XENLOG_WARNING "POLLED occurred on CPU %d.\n",
               mc_global->mc_coreid);

    /* then the bank information */
    x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
    do {
        if ( mic == NULL )
            return;
        if ( mic->type != MC_TYPE_BANK )
            goto next;

        mc_bank = (struct mcinfo_bank *)mic;

        printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
               mc_bank->mc_bank,
               mc_bank->mc_status);
        if ( mc_bank->mc_status & MCi_STATUS_MISCV )
            printk("[%16"PRIx64"]", mc_bank->mc_misc);
        if ( mc_bank->mc_status & MCi_STATUS_ADDRV )
            printk(" at %16"PRIx64, mc_bank->mc_addr);
        printk("\n");

        if ( is_mc_panic )
            x86_mcinfo_apei_save(mc_global, mc_bank);

 next:
        mic = x86_mcinfo_next(mic); /* next entry */
        if ( (mic == NULL) || (mic->size == 0) )
            break;
    } while ( 1 );
}

static void do_mc_get_cpu_info(void *v)
{
    int cpu = smp_processor_id();
    int cindex, cpn;
    struct cpuinfo_x86 *c;
    xen_mc_logical_cpu_t *log_cpus, *xcp;
    uint32_t junk, ebx;

    log_cpus = v;
    c = &cpu_data[cpu];
    cindex = 0;
    cpn = cpu - 1;

    /*
     * Deal with sparse masks, condensed into a contiguous array.
     */
    while ( cpn >= 0 )
    {
        if ( cpu_online(cpn) )
            cindex++;
        cpn--;
    }

    xcp = &log_cpus[cindex];
    c = &cpu_data[cpu];
    xcp->mc_cpunr = cpu;
    x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
                        &xcp->mc_coreid, &xcp->mc_threadid,
                        &xcp->mc_apicid, &xcp->mc_ncores,
                        &xcp->mc_ncores_active, &xcp->mc_nthreads);
    xcp->mc_cpuid_level = c->cpuid_level;
    xcp->mc_family = c->x86;
    xcp->mc_vendor = c->x86_vendor;
    xcp->mc_model = c->x86_model;
    xcp->mc_step = c->x86_mask;
    xcp->mc_cache_size = c->x86_cache_size;
    xcp->mc_cache_alignment = c->x86_cache_alignment;
    memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
    memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
    memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);

    /*
     * This part needs to run on the CPU itself.
     */
    xcp->mc_nmsrvals = 1;
    xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
    rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);

    if ( ppin_msr && xcp->mc_nmsrvals < ARRAY_SIZE(xcp->mc_msrvalues) )
    {
        xcp->mc_msrvalues[xcp->mc_nmsrvals].reg = ppin_msr;
        rdmsrl(ppin_msr, xcp->mc_msrvalues[xcp->mc_nmsrvals].value);
        ++xcp->mc_nmsrvals;
    }

    if ( c->cpuid_level >= 1 )
    {
        cpuid(1, &junk, &ebx, &junk, &junk);
        xcp->mc_clusterid = (ebx >> 24) & 0xff;
    }
    else
        xcp->mc_clusterid = get_apic_id();
}

void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
                         uint16_t *threadid, uint32_t *apicid,
                         unsigned *ncores, unsigned *ncores_active,
                         unsigned *nthreads)
{
    struct cpuinfo_x86 *c;

    *apicid = cpu_physical_id(cpu);
    c = &cpu_data[cpu];
    if ( c->apicid == BAD_APICID )
    {
        *chipid = cpu;
        *coreid = 0;
        *threadid = 0;
        if ( ncores != NULL )
            *ncores = 1;
        if ( ncores_active != NULL )
            *ncores_active = 1;
        if ( nthreads != NULL )
            *nthreads = 1;
    }
    else
    {
        *chipid = c->phys_proc_id;
        if ( c->x86_max_cores > 1 )
            *coreid = c->cpu_core_id;
        else
            *coreid = 0;
        *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
        if ( ncores != NULL )
            *ncores = c->x86_max_cores;
        if ( ncores_active != NULL )
            *ncores_active = c->booted_cores;
        if ( nthreads != NULL )
            *nthreads = c->x86_num_siblings;
    }
}

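/*
 * MSR interposition table: XEN_MC_msrinject with MC_MSRINJ_F_INTERPOSE
 * records per-CPU <MSR, value> pairs here instead of touching hardware,
 * so that later reads going through intpose_lookup() see the injected
 * values.
 */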
#define INTPOSE_NENT 50

static struct intpose_ent {
    unsigned int cpu_nr;
    uint64_t msr;
    uint64_t val;
} intpose_arr[INTPOSE_NENT];

static void intpose_init(void)
{
    static int done;
    int i;

    if ( done++ > 0 )
        return;

    for ( i = 0; i < INTPOSE_NENT; i++ )
        intpose_arr[i].cpu_nr = -1;

}

struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
                                   uint64_t *valp)
{
    int i;

    for ( i = 0; i < INTPOSE_NENT; i++ )
    {
        if ( intpose_arr[i].cpu_nr == cpu_nr && intpose_arr[i].msr == msr )
        {
            if ( valp != NULL )
                *valp = intpose_arr[i].val;
            return &intpose_arr[i];
        }
    }

    return NULL;
}

static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
{
    struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
    int i;

    if ( ent )
    {
        ent->val = val;
        return;
    }

    for ( i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++ )
    {
        if ( ent->cpu_nr == -1 )
        {
            ent->cpu_nr = cpu_nr;
            ent->msr = msr;
            ent->val = val;
            return;
        }
    }

    printk("intpose_add: interpose array full - request dropped\n");
}

bool intpose_inval(unsigned int cpu_nr, uint64_t msr)
{
    struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);

    if ( !ent )
        return false;

    ent->cpu_nr = -1;
    return true;
}

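/*
 * Each MCA bank exposes four consecutive MSRs starting at MSR_IA32_MC0_CTL:
 * CTL, STATUS, ADDR and MISC.  The "% 4" below therefore accepts
 * STATUS/ADDR/MISC of any present bank while rejecting the MCi_CTL MSRs.
 */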
#define IS_MCA_BANKREG(r, cpu) \
    ((r) >= MSR_IA32_MC0_CTL && \
     (r) <= MSR_IA32_MCx_MISC(per_cpu(nr_mce_banks, cpu) - 1) && \
     ((r) - MSR_IA32_MC0_CTL) % 4) /* excludes MCi_CTL */

static bool x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
{
    const struct cpuinfo_x86 *c = &cpu_data[mci->mcinj_cpunr];
    int i, errs = 0;

    for ( i = 0; i < mci->mcinj_count; i++ )
    {
        uint64_t reg = mci->mcinj_msr[i].reg;
        const char *reason = NULL;

        if ( IS_MCA_BANKREG(reg, mci->mcinj_cpunr) )
        {
            if ( c->x86_vendor == X86_VENDOR_AMD )
            {
                /*
                 * On AMD we can set MCi_STATUS_WREN in the
                 * HWCR MSR to allow non-zero writes to bank
                 * MSRs without taking a #GP. The injector in
                 * dom0 should set that bit, but we detect when
                 * it is necessary and set it as a courtesy to
                 * avoid #GP in the hypervisor.
                 */
                mci->mcinj_flags |= _MC_MSRINJ_F_REQ_HWCR_WREN;
                continue;
            }
            else
            {
                /*
                 * No alternative but to interpose, so require
                 * that the injector specified interposition.
                 */
                if ( !(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) )
                    reason = "must specify interposition";
            }
        }
        else
        {
            switch ( reg )
            {
            /* MSRs acceptable on all x86 cpus */
            case MSR_IA32_MCG_STATUS:
                break;

            case MSR_F10_MC4_MISC1:
            case MSR_F10_MC4_MISC2:
            case MSR_F10_MC4_MISC3:
                if ( c->x86_vendor != X86_VENDOR_AMD )
                    reason = "only supported on AMD";
                else if ( c->x86 < 0x10 )
                    reason = "only supported on AMD Fam10h+";
                break;

            /* MSRs that the HV will take care of */
            case MSR_K8_HWCR:
                if ( c->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
                    reason = "HV will operate HWCR";
                else
                    reason = "only supported on AMD or Hygon";
                break;

            default:
                reason = "not a recognized MCA MSR";
                break;
            }
        }

        if ( reason != NULL )
        {
            printk("HV MSR INJECT ERROR: MSR %#Lx %s\n",
                   (unsigned long long)mci->mcinj_msr[i].reg, reason);
            errs++;
        }
    }

    return !errs;
}

static uint64_t x86_mc_hwcr_wren(void)
{
    uint64_t old;

    rdmsrl(MSR_K8_HWCR, old);

    if ( !(old & K8_HWCR_MCi_STATUS_WREN) )
    {
        uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
        wrmsrl(MSR_K8_HWCR, new);
    }

    return old;
}

static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
{
    if ( !(hwcr & K8_HWCR_MCi_STATUS_WREN) )
        wrmsrl(MSR_K8_HWCR, hwcr);
}

static void x86_mc_msrinject(void *data)
{
    struct xen_mc_msrinject *mci = data;
    struct mcinfo_msr *msr;
    uint64_t hwcr = 0;
    int intpose;
    int i;

    if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
        hwcr = x86_mc_hwcr_wren();

    intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;

    for ( i = 0, msr = &mci->mcinj_msr[0]; i < mci->mcinj_count; i++, msr++ )
    {
        printk("HV MSR INJECT (%s) target %u actual %u MSR %#Lx <-- %#Lx\n",
               intpose ? "interpose" : "hardware",
               mci->mcinj_cpunr, smp_processor_id(),
               (unsigned long long)msr->reg,
               (unsigned long long)msr->value);

        if ( intpose )
            intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
        else
            wrmsrl(msr->reg, msr->value);
    }

    if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
        x86_mc_hwcr_wren_restore(hwcr);
}

/*ARGSUSED*/
static void x86_mc_mceinject(void *data)
{
    printk("Simulating #MC on cpu %d\n", smp_processor_id());
    __asm__ __volatile__("int $0x12");
}

#if BITS_PER_LONG == 64

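/*
 * Telemetry cookies are exposed to dom0 as 64-bit fetch_id values; on a
 * 64-bit build the conversion is simply a cast between pointer and id.
 */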
#define ID2COOKIE(id) ((mctelem_cookie_t)(id))
#define COOKIE2ID(c) ((uint64_t)(c))

#elif defined(BITS_PER_LONG)
#error BITS_PER_LONG has unexpected value
#else
#error BITS_PER_LONG definition absent
#endif

# include <compat/arch-x86/xen-mca.h>

# define xen_mcinfo_msr mcinfo_msr
CHECK_mcinfo_msr;
# undef xen_mcinfo_msr
# undef CHECK_mcinfo_msr
# define CHECK_mcinfo_msr struct mcinfo_msr

# define xen_mcinfo_common mcinfo_common
CHECK_mcinfo_common;
# undef xen_mcinfo_common
# undef CHECK_mcinfo_common
# define CHECK_mcinfo_common struct mcinfo_common

CHECK_FIELD_(struct, mc_fetch, flags);
CHECK_FIELD_(struct, mc_fetch, fetch_id);
# define CHECK_compat_mc_fetch struct mc_fetch

CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
# define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo

#define CHECK_compat_mc_inject_v2 struct mc_inject_v2
CHECK_mc;
# undef CHECK_compat_mc_fetch
# undef CHECK_compat_mc_physcpuinfo

# define xen_mc_info mc_info
CHECK_mc_info;
# undef xen_mc_info

# define xen_mcinfo_global mcinfo_global
CHECK_mcinfo_global;
# undef xen_mcinfo_global

# define xen_mcinfo_bank mcinfo_bank
CHECK_mcinfo_bank;
# undef xen_mcinfo_bank

# define xen_mcinfo_extended mcinfo_extended
CHECK_mcinfo_extended;
# undef xen_mcinfo_extended

# define xen_mcinfo_recovery mcinfo_recovery
# define xen_cpu_offline_action cpu_offline_action
# define xen_page_offline_action page_offline_action
CHECK_mcinfo_recovery;
# undef xen_cpu_offline_action
# undef xen_page_offline_action
# undef xen_mcinfo_recovery

/* Machine Check Architecture Hypercall */
long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
{
    long ret = 0;
    struct xen_mc curop, *op = &curop;
    struct vcpu *v = current;
    union {
        struct xen_mc_fetch *nat;
        struct compat_mc_fetch *cmp;
    } mc_fetch;
    union {
        struct xen_mc_physcpuinfo *nat;
        struct compat_mc_physcpuinfo *cmp;
    } mc_physcpuinfo;
    uint32_t flags, cmdflags;
    int nlcpu;
    mctelem_cookie_t mctc;
    mctelem_class_t which;
    unsigned int target;
    struct xen_mc_msrinject *mc_msrinject;
    struct xen_mc_mceinject *mc_mceinject;

    ret = xsm_do_mca(XSM_PRIV);
    if ( ret )
        return x86_mcerr("", ret);

    if ( copy_from_guest(op, u_xen_mc, 1) )
        return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);

    if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
        return x86_mcerr("do_mca: interface version mismatch", -EACCES);

    switch ( op->cmd )
    {
    case XEN_MC_fetch:
        mc_fetch.nat = &op->u.mc_fetch;
        cmdflags = mc_fetch.nat->flags;

        switch ( cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT) )
        {
        case XEN_MC_NONURGENT:
            which = MC_NONURGENT;
            break;

        case XEN_MC_URGENT:
            which = MC_URGENT;
            break;

        default:
            return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
        }

        flags = XEN_MC_OK;

        if ( cmdflags & XEN_MC_ACK )
        {
            mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
            mctelem_ack(which, cookie);
        }
        else
        {
            if ( !is_pv_32bit_vcpu(v)
                 ? guest_handle_is_null(mc_fetch.nat->data)
                 : compat_handle_is_null(mc_fetch.cmp->data) )
                return x86_mcerr("do_mca fetch: guest buffer "
                                 "invalid", -EINVAL);

            mctc = mctelem_consume_oldest_begin(which);
            if ( mctc )
            {
                struct mc_info *mcip = mctelem_dataptr(mctc);
                if ( !is_pv_32bit_vcpu(v)
                     ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
                     : copy_to_compat(mc_fetch.cmp->data, mcip, 1) )
                {
                    ret = -EFAULT;
                    flags |= XEN_MC_FETCHFAILED;
                    mc_fetch.nat->fetch_id = 0;
                }
                else
                    mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
                mctelem_consume_oldest_end(mctc);
            }
            else
            {
                /* There is no data */
                flags |= XEN_MC_NODATA;
                mc_fetch.nat->fetch_id = 0;
            }

            mc_fetch.nat->flags = flags;
            if ( copy_to_guest(u_xen_mc, op, 1) != 0 )
                ret = -EFAULT;
        }

        break;

    case XEN_MC_notifydomain:
        return x86_mcerr("do_mca notify unsupported", -EINVAL);

    case XEN_MC_physcpuinfo:
        mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
        nlcpu = num_online_cpus();

        if ( !is_pv_32bit_vcpu(v)
             ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
             : !compat_handle_is_null(mc_physcpuinfo.cmp->info) )
        {
            xen_mc_logical_cpu_t *log_cpus;

            if ( mc_physcpuinfo.nat->ncpus <= 0 )
                return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
                                 -EINVAL);
            nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
            log_cpus = xzalloc_array(xen_mc_logical_cpu_t, nlcpu);
            if ( log_cpus == NULL )
                return x86_mcerr("do_mca cpuinfo", -ENOMEM);
            on_each_cpu(do_mc_get_cpu_info, log_cpus, 1);
            if ( !is_pv_32bit_vcpu(v)
                 ? copy_to_guest(mc_physcpuinfo.nat->info, log_cpus, nlcpu)
                 : copy_to_compat(mc_physcpuinfo.cmp->info, log_cpus, nlcpu) )
                ret = -EFAULT;
            xfree(log_cpus);
        }

        mc_physcpuinfo.nat->ncpus = nlcpu;

        if ( copy_to_guest(u_xen_mc, op, 1) )
            return x86_mcerr("do_mca cpuinfo", -EFAULT);

        break;

    case XEN_MC_msrinject:
        if ( !mca_allbanks || !mca_allbanks->num )
            return x86_mcerr("do_mca inject", -ENODEV);

        mc_msrinject = &op->u.mc_msrinject;
        target = mc_msrinject->mcinj_cpunr;

        if ( target >= nr_cpu_ids )
            return x86_mcerr("do_mca inject: bad target", -EINVAL);

        if ( !cpu_online(target) )
            return x86_mcerr("do_mca inject: target offline",
                             -EINVAL);

        if ( !per_cpu(nr_mce_banks, target) )
            return x86_mcerr("do_mca inject: no banks", -ENOENT);

        if ( mc_msrinject->mcinj_count == 0 )
            return 0;

        if ( mc_msrinject->mcinj_flags & MC_MSRINJ_F_GPADDR )
        {
            domid_t domid;
            struct domain *d;
            struct mcinfo_msr *msr;
            unsigned int i;
            paddr_t gaddr;
            unsigned long gfn, mfn;
            p2m_type_t t;

            domid = (mc_msrinject->mcinj_domid == DOMID_SELF) ?
                    current->domain->domain_id : mc_msrinject->mcinj_domid;
            if ( domid >= DOMID_FIRST_RESERVED )
                return x86_mcerr("do_mca inject: incompatible flag "
                                 "MC_MSRINJ_F_GPADDR with domain %d",
                                 -EINVAL, domid);

            d = get_domain_by_id(domid);
            if ( d == NULL )
                return x86_mcerr("do_mca inject: bad domain id %d",
                                 -EINVAL, domid);

            for ( i = 0, msr = &mc_msrinject->mcinj_msr[0];
                  i < mc_msrinject->mcinj_count;
                  i++, msr++ )
            {
                gaddr = msr->value;
                gfn = PFN_DOWN(gaddr);
                mfn = mfn_x(get_gfn(d, gfn, &t));

                if ( mfn == mfn_x(INVALID_MFN) )
                {
                    put_gfn(d, gfn);
                    put_domain(d);
                    return x86_mcerr("do_mca inject: bad gfn %#lx of domain %d",
                                     -EINVAL, gfn, domid);
                }

                msr->value = pfn_to_paddr(mfn) | (gaddr & (PAGE_SIZE - 1));

                put_gfn(d, gfn);
            }

            put_domain(d);
        }

        if ( !x86_mc_msrinject_verify(mc_msrinject) )
            return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);

        add_taint(TAINT_ERROR_INJECT);

        on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
                         mc_msrinject, 1);

        break;

    case XEN_MC_mceinject:
        if ( !mca_allbanks || !mca_allbanks->num )
            return x86_mcerr("do_mca #MC", -ENODEV);

        mc_mceinject = &op->u.mc_mceinject;
        target = mc_mceinject->mceinj_cpunr;

        if ( target >= nr_cpu_ids )
            return x86_mcerr("do_mca #MC: bad target", -EINVAL);

        if ( !cpu_online(target) )
            return x86_mcerr("do_mca #MC: target offline", -EINVAL);

        if ( !per_cpu(nr_mce_banks, target) )
            return x86_mcerr("do_mca #MC: no banks", -ENOENT);

        add_taint(TAINT_ERROR_INJECT);

        if ( mce_broadcast )
            on_each_cpu(x86_mc_mceinject, mc_mceinject, 1);
        else
            on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
                             mc_mceinject, 1);
        break;

    case XEN_MC_inject_v2:
    {
        const cpumask_t *cpumap;
        cpumask_var_t cmv;
        bool broadcast = op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST;

        if ( !mca_allbanks || !mca_allbanks->num )
            return x86_mcerr("do_mca #MC", -ENODEV);

        if ( broadcast )
            cpumap = &cpu_online_map;
        else
        {
            ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap);
            if ( ret )
                break;
            cpumap = cmv;
            if ( !cpumask_intersects(cpumap, &cpu_online_map) )
            {
                free_cpumask_var(cmv);
                ret = x86_mcerr("No online CPU passed\n", -EINVAL);
                break;
            }
            if ( !cpumask_subset(cpumap, &cpu_online_map) )
                dprintk(XENLOG_INFO,
                        "Not all required CPUs are online\n");
        }

        for_each_cpu(target, cpumap)
            if ( cpu_online(target) && !per_cpu(nr_mce_banks, target) )
            {
                ret = x86_mcerr("do_mca #MC: CPU%u has no banks",
                                -ENOENT, target);
                break;
            }
        if ( ret )
            break;

        switch ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK )
        {
        case XEN_MC_INJECT_TYPE_MCE:
            if ( mce_broadcast &&
                 !cpumask_equal(cpumap, &cpu_online_map) )
                printk("Not trigger MCE on all CPUs, may HANG!\n");
            on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
            break;

        case XEN_MC_INJECT_TYPE_CMCI:
            if ( !cmci_apic_vector )
                ret = x86_mcerr("No CMCI supported in platform\n", -EINVAL);
            else
            {
                if ( cpumask_test_cpu(smp_processor_id(), cpumap) )
                    send_IPI_self(cmci_apic_vector);
                send_IPI_mask(cpumap, cmci_apic_vector);
            }
            break;

        case XEN_MC_INJECT_TYPE_LMCE:
            if ( !lmce_support )
            {
                ret = x86_mcerr("No LMCE support", -EINVAL);
                break;
            }
            if ( broadcast )
            {
                ret = x86_mcerr("Broadcast cannot be used with LMCE", -EINVAL);
                break;
            }
            /* Ensure at most one CPU is specified. */
            if ( nr_cpu_ids > cpumask_next(cpumask_first(cpumap), cpumap) )
            {
                ret = x86_mcerr("More than one CPU specified for LMCE",
                                -EINVAL);
                break;
            }
            on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
            break;

        default:
            ret = x86_mcerr("Wrong mca type\n", -EINVAL);
            break;
        }

        if ( cpumap != &cpu_online_map )
            free_cpumask_var(cmv);

        break;
    }

    default:
        return x86_mcerr("do_mca: bad command", -EINVAL);
    }

    return ret;
}

int mcinfo_dumpped;
static int x86_mcinfo_dump_panic(mctelem_cookie_t mctc)
{
    struct mc_info *mcip = mctelem_dataptr(mctc);

    x86_mcinfo_dump(mcip);
    mcinfo_dumpped++;

    return 0;
}

/* XXX shall we dump committed mc_info?? */
static void mc_panic_dump(void)
{
    int cpu;

    dprintk(XENLOG_ERR, "Begin dump mc_info\n");
    for_each_online_cpu(cpu)
        mctelem_process_deferred(cpu, x86_mcinfo_dump_panic,
                                 mctelem_has_deferred_lmce(cpu));
    dprintk(XENLOG_ERR, "End dump mc_info, %x mcinfo dumped\n", mcinfo_dumpped);
}

void mc_panic(char *s)
{
    is_mc_panic = true;
    console_force_unlock();

    printk("Fatal machine check: %s\n", s);
    printk("\n"
           "****************************************\n"
           "\n"
           "   The processor has reported a hardware error which cannot\n"
           "   be recovered from.  Xen will now reboot the machine.\n");
    mc_panic_dump();
    panic("HARDWARE ERROR\n");
}

/*
 * Machine Check owner judging algorithm:
 * When an error happens, all CPUs serially read their MSR banks.
 * The first CPU that fetches the error bank's info clears
 * this bank. Later readers can't get any information again.
 * The first CPU is the actual mce_owner.
 *
 * A fatal (pcc = 1) error might crash the machine before we're
 * able to log it. To avoid losing logs, we adopt two-round
 * scanning:
 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
 *          All MCE banks are sticky; on boot-up, the MCE polling
 *          mechanism will help to collect and log those MCE errors.
 * Round 2: Do all MCE processing logic as normal.
 */

/* Maybe called in MCE context, no lock, no printk */
static enum mce_result mce_action(const struct cpu_user_regs *regs,
                                  mctelem_cookie_t mctc)
{
    struct mc_info *local_mi;
    enum mce_result bank_result = MCER_NOERROR;
    enum mce_result worst_result = MCER_NOERROR;
    struct mcinfo_common *mic = NULL;
    struct mca_binfo binfo;
    const struct mca_error_handler *handlers = mce_dhandlers;
    unsigned int i, handler_num = mce_dhandler_num;

    /* When in mce context, regs is valid */
    if ( regs )
    {
        handler_num = mce_uhandler_num;
        handlers = mce_uhandlers;
    }

    local_mi = (struct mc_info *)mctelem_dataptr(mctc);
    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
    if ( mic == NULL )
    {
        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
        return MCER_CONTINUE;
    }

    memset(&binfo, 0, sizeof(binfo));
    binfo.mig = (struct mcinfo_global *)mic;
    binfo.mi = local_mi;

    /* Processing bank information */
    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);

    for ( ; bank_result != MCER_RESET && mic && mic->size;
          mic = x86_mcinfo_next(mic) )
    {
        if ( mic->type != MC_TYPE_BANK )
        {
            continue;
        }
        binfo.mib = (struct mcinfo_bank *)mic;
        binfo.bank = binfo.mib->mc_bank;
        bank_result = MCER_NOERROR;
        for ( i = 0; i < handler_num; i++ )
        {
            if ( handlers[i].owned_error(binfo.mib->mc_status) )
            {
                handlers[i].recovery_handler(&binfo, &bank_result, regs);
                if ( worst_result < bank_result )
                    worst_result = bank_result;
                break;
            }
        }
    }

    return worst_result;
}

/*
 * Called from mctelem_process_deferred. Return 1 if the telemetry
 * should be committed for dom0 consumption, 0 if it should be
 * dismissed.
 */
static int mce_delayed_action(mctelem_cookie_t mctc)
{
    enum mce_result result;
    int ret = 0;

    result = mce_action(NULL, mctc);

    switch ( result )
    {
    case MCER_RESET:
        dprintk(XENLOG_ERR, "MCE delayed action failed\n");
        is_mc_panic = true;
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        panic("MCE: Software recovery failed for the UCR\n");
        break;

    case MCER_RECOVERED:
        dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
        ret = 1;
        break;

    case MCER_CONTINUE:
        dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
                "system is tainted\n");
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        ret = 1;
        break;

    default:
        ret = 0;
        break;
    }
    return ret;
}

/* Softirq Handler for this MCE# processing */
static void mce_softirq(void)
{
    static DEFINE_MCE_BARRIER(mce_inside_bar);
    static DEFINE_MCE_BARRIER(mce_severity_bar);
    static atomic_t severity_cpu;
    int cpu = smp_processor_id();
    unsigned int workcpu;
    bool lmce = mctelem_has_deferred_lmce(cpu);
    bool bcast = mce_broadcast && !lmce;

    mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);

    mce_barrier_enter(&mce_inside_bar, bcast);

    if ( !lmce )
    {
        /*
         * Everybody is here. Now let's see who gets to do the
         * recovery work. Right now we just see if there's a CPU
         * that did not have any problems, and pick that one.
         *
         * First, just set a default value: the last CPU who reaches this
         * will overwrite the value and become the default.
         */

        atomic_set(&severity_cpu, cpu);

        mce_barrier_enter(&mce_severity_bar, bcast);
        if ( !mctelem_has_deferred(cpu) )
            atomic_set(&severity_cpu, cpu);
        mce_barrier_exit(&mce_severity_bar, bcast);
    }

    /* We choose severity_cpu for further processing */
    if ( lmce || atomic_read(&severity_cpu) == cpu )
    {

        mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);

        /*
         * Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
         * vMCE MSRs virtualization buffer
         */

        if ( lmce )
            mctelem_process_deferred(cpu, mce_delayed_action, true);
        else
            for_each_online_cpu(workcpu)
                mctelem_process_deferred(workcpu, mce_delayed_action, false);

        /* Step2: Send Log to DOM0 through vIRQ */
        if ( dom0_vmce_enabled() )
        {
            mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
    }

    mce_barrier_exit(&mce_inside_bar, bcast);
}

/*
 * Machine Check owner judging algorithm:
 * When an error happens, all CPUs serially read their MSR banks.
 * The first CPU that fetches the error bank's info clears
 * this bank. Later readers can't get any information again.
 * The first CPU is the actual mce_owner.
 *
 * A fatal (pcc = 1) error might crash the machine before we're
 * able to log it. To avoid losing logs, we adopt two-round
 * scanning:
 * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
 *          All MCE banks are sticky; on boot-up, the MCE polling
 *          mechanism will help to collect and log those MCE errors.
 * Round 2: Do all MCE processing logic as normal.
 */
void mce_handler_init(void)
{
    if ( smp_processor_id() != 0 )
        return;

    /* callback register, do we really need so many callbacks? */
    /* mce handler data initialization */
    spin_lock_init(&mce_logout_lock);
    open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
}
