1 /*
2  * mce.c - x86 Machine Check Exception Reporting
3  * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4  */
5 
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/param.h>
10 #include <xen/smp.h>
11 #include <xen/errno.h>
12 #include <xen/console.h>
13 #include <xen/sched.h>
14 #include <xen/cpumask.h>
15 #include <xen/event.h>
16 #include <xen/guest_access.h>
17 #include <xen/hypercall.h> /* for do_mca */
18 #include <xen/cpu.h>
19 
20 #include <asm/processor.h>
21 #include <asm/setup.h>
22 #include <asm/system.h>
23 #include <asm/apic.h>
24 #include <asm/msr.h>
25 #include <asm/p2m.h>
26 
27 #include "mce.h"
28 #include "barrier.h"
29 #include "mcaction.h"
30 #include "util.h"
31 #include "vmce.h"
32 
33 bool __read_mostly opt_mce = true;
34 boolean_param("mce", opt_mce);
35 bool __read_mostly mce_broadcast;
36 bool is_mc_panic;
37 DEFINE_PER_CPU_READ_MOSTLY(unsigned int, nr_mce_banks);
38 unsigned int __read_mostly firstbank;
39 unsigned int __read_mostly ppin_msr;
40 uint8_t __read_mostly cmci_apic_vector;
41 
42 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
43 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
44 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
45 
46 static void intpose_init(void);
47 static void mcinfo_clear(struct mc_info *);
48 struct mca_banks *mca_allbanks;
49 
50 #define SEG_PL(segsel)   ((segsel) & 0x3)
51 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
52 
53 #if 0
54 #define x86_mcerr(fmt, err, args...)                                    \
55     ({                                                                  \
56         int _err = (err);                                               \
57         gdprintk(XENLOG_WARNING, "x86_mcerr: " fmt ", returning %d\n",  \
58                  ## args, _err);                                        \
59         _err;                                                           \
60     })
61 #else
62 #define x86_mcerr(fmt, err, args...) (err)
63 #endif
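/*
 * Flipping the "#if 0" above to "#if 1" makes x86_mcerr() log the formatted
 * message and error code via gdprintk() before returning the error; by
 * default it simply passes the error code through.
 */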
64 
65 int mce_verbosity;
static int __init mce_set_verbosity(const char *str)
67 {
68     if ( strcmp("verbose", str) == 0 )
69         mce_verbosity = MCE_VERBOSE;
70     else
71         return -EINVAL;
72 
73     return 0;
74 }
75 custom_param("mce_verbosity", mce_set_verbosity);
76 
77 /* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(const struct cpu_user_regs *regs)
79 {
80     console_force_unlock();
81     printk("Unexpected Machine Check Exception\n");
82     fatal_trap(regs, 1);
83 }
84 
85 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
86 
void x86_mce_vector_register(x86_mce_vector_t hdlr)
88 {
89     _machine_check_vector = hdlr;
90 }
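/*
 * Vendor-specific setup (e.g. the Intel/AMD mcheck init paths) is expected
 * to install its real #MC handler here, replacing unexpected_machine_check().
 */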
91 
92 /* Call the installed machine check handler for this CPU setup. */
93 
void do_machine_check(const struct cpu_user_regs *regs)
95 {
96     mce_enter();
97     _machine_check_vector(regs);
98     mce_exit();
99 }
100 
/*
 * Init machine check callback handler.
 * It is used to collect additional information provided by newer
 * CPU families/models without the need to duplicate the whole handler.
 * This avoids having many handlers doing almost the same thing, each
 * with its own tweaks and bugs.
 */
108 static x86_mce_callback_t mc_callback_bank_extended = NULL;
109 
void x86_mce_callback_register(x86_mce_callback_t cbfunc)
111 {
112     mc_callback_bank_extended = cbfunc;
113 }
114 
/*
 * Machine check recoverability judgement callback handler.
 * It is used to judge whether a UC error is recoverable by software.
 */
119 static mce_recoverable_t mc_recoverable_scan = NULL;
120 
void mce_recoverable_register(mce_recoverable_t cbfunc)
122 {
123     mc_recoverable_scan = cbfunc;
124 }
125 
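/* Allocate a struct mca_banks together with its bank bitmap. */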
struct mca_banks *mcabanks_alloc(unsigned int nr_mce_banks)
127 {
128     struct mca_banks *mb;
129 
130     mb = xmalloc(struct mca_banks);
131     if ( !mb )
132         return NULL;
133 
    /*
     * For APs, allocations get done by the BSP, i.e. when the bank count
     * may not be known yet. A zero bank count is a clear indication of this.
     */
138     if ( !nr_mce_banks )
139         nr_mce_banks = MCG_CAP_COUNT;
140 
141     mb->bank_map = xzalloc_array(unsigned long,
142                                  BITS_TO_LONGS(nr_mce_banks));
143     if ( !mb->bank_map )
144     {
145         xfree(mb);
146         return NULL;
147     }
148 
149     mb->num = nr_mce_banks;
150 
151     return mb;
152 }
153 
void mcabanks_free(struct mca_banks *banks)
155 {
156     if ( banks == NULL )
157         return;
158     if ( banks->bank_map )
159         xfree(banks->bank_map);
160     xfree(banks);
161 }
162 
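/* Clear a bank's ADDR/MISC registers (when valid) and then its STATUS register. */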
static void mcabank_clear(int banknum)
164 {
165     uint64_t status;
166 
167     status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum));
168 
169     if ( status & MCi_STATUS_ADDRV )
170         mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL);
171     if ( status & MCi_STATUS_MISCV )
172         mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL);
173 
174     mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL);
175 }
176 
/*
 * Callback handler for judging whether to clear a Machine Check error bank.
 * According to Intel's latest MCA OS Recovery Writer's Guide, whether the
 * erroneous MCA bank needs to be cleared is decided by the mca_source and
 * the MCi_STATUS bit values.
 */
183 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
184 
void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
186 {
187     mc_need_clearbank_scan = cbfunc;
188 }
189 
190 /*
191  * mce_logout_lock should only be used in the trap handler,
192  * while MCIP has not been cleared yet in the global status
193  * register. Other use is not safe, since an MCE trap can
194  * happen at any moment, which would cause lock recursion.
195  */
196 static DEFINE_SPINLOCK(mce_logout_lock);
197 
198 const struct mca_error_handler *__read_mostly mce_dhandlers;
199 const struct mca_error_handler *__read_mostly mce_uhandlers;
200 unsigned int __read_mostly mce_dhandler_num;
201 unsigned int __read_mostly mce_uhandler_num;
202 
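/*
 * Append one bank's telemetry (STATUS, plus MISC/ADDR when valid) to the
 * mc_info buffer; for poller/CMCI sourced errors, also try to attribute the
 * reported address to the owning domain.
 */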
static void mca_init_bank(enum mca_source who, struct mc_info *mi, int bank)
204 {
205     struct mcinfo_bank *mib;
206 
207     if ( !mi )
208         return;
209 
210     mib = x86_mcinfo_reserve(mi, sizeof(*mib), MC_TYPE_BANK);
211     if ( !mib )
212     {
213         mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
214         return;
215     }
216 
217     mib->mc_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank));
218 
219     mib->mc_bank = bank;
220     mib->mc_domid = DOMID_INVALID;
221 
222     if ( mib->mc_status & MCi_STATUS_MISCV )
223         mib->mc_misc = mca_rdmsr(MSR_IA32_MCx_MISC(bank));
224 
225     if ( mib->mc_status & MCi_STATUS_ADDRV )
226         mib->mc_addr = mca_rdmsr(MSR_IA32_MCx_ADDR(bank));
227 
228     if ( (mib->mc_status & MCi_STATUS_MISCV) &&
229          (mib->mc_status & MCi_STATUS_ADDRV) &&
230          (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
231          (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
232          (mfn_valid(_mfn(paddr_to_pfn(mib->mc_addr)))) )
233     {
234         struct domain *d;
235 
236         d = maddr_get_owner(mib->mc_addr);
237         if ( d )
238             mib->mc_domid = d->domain_id;
239     }
240 
241     if ( who == MCA_CMCI_HANDLER )
242     {
243         mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank);
244         mib->mc_tsc = rdtsc();
245     }
246 }
247 
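/* Fill in the global record: MCG_STATUS, the detecting CPU's topology and the current vCPU. */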
static int mca_init_global(uint32_t flags, struct mcinfo_global *mig)
249 {
250     uint64_t status;
251     int cpu_nr;
252     const struct vcpu *curr = current;
253 
254     /* Set global information */
255     status = mca_rdmsr(MSR_IA32_MCG_STATUS);
256     mig->mc_gstatus = status;
257     mig->mc_domid = DOMID_INVALID;
258     mig->mc_vcpuid = XEN_MC_VCPUID_INVALID;
259     mig->mc_flags = flags;
260     cpu_nr = smp_processor_id();
261     /* Retrieve detector information */
262     x86_mc_get_cpu_info(cpu_nr, &mig->mc_socketid,
263                         &mig->mc_coreid, &mig->mc_core_threadid,
264                         &mig->mc_apicid, NULL, NULL, NULL);
265 
266     if ( curr )
267     {
268         mig->mc_domid = curr->domain->domain_id;
269         mig->mc_vcpuid = curr->vcpu_id;
270     }
271 
272     return 0;
273 }
274 
275 /*
276  * Utility function to perform MCA bank telemetry readout and to push that
277  * telemetry towards an interested dom0 for logging and diagnosis.
278  * The caller - #MC handler or MCA poll function - must arrange that we
279  * do not migrate cpus.
280  */
281 
282 /* XXFM Could add overflow counting? */
283 
/*
 * The out-parameter clear_bank is for the Machine Check handler caller.
 * On recent Intel CPUs, whether to clear the error bank status needs to
 * be judged by the callback function registered above.
 */
289 mctelem_cookie_t
mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
                  struct mca_summary *sp, struct mca_banks *clear_bank)
292 {
293     uint64_t gstatus, status;
294     struct mcinfo_global *mig = NULL; /* on stack */
295     mctelem_cookie_t mctc = NULL;
296     bool uc = false, pcc = false, recover = true, need_clear = true;
297     uint32_t mc_flags = 0;
298     struct mc_info *mci = NULL;
299     mctelem_class_t which = MC_URGENT; /* XXXgcc */
300     int errcnt = 0;
301     int i;
302 
303     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
304     switch ( who )
305     {
306     case MCA_MCE_SCAN:
307         mc_flags = MC_FLAG_MCE;
308         which = MC_URGENT;
309         break;
310 
311     case MCA_POLLER:
312     case MCA_RESET:
313         mc_flags = MC_FLAG_POLLED;
314         which = MC_NONURGENT;
315         break;
316 
317     case MCA_CMCI_HANDLER:
318         mc_flags = MC_FLAG_CMCI;
319         which = MC_NONURGENT;
320         break;
321 
322     default:
323         BUG();
324     }
325 
    /*
     * If no mc_recoverable_scan callback handler is registered,
     * this error is treated as not recoverable.
     */
330     recover = mc_recoverable_scan ? 1 : 0;
331 
332     for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
333     {
334         /* Skip bank if corresponding bit in bankmask is clear */
335         if ( !mcabanks_test(i, bankmask) )
336             continue;
337 
338         status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
339         if ( !(status & MCi_STATUS_VAL) )
340             continue; /* this bank has no valid telemetry */
341 
        /*
         * For CMCI/MCE handler callers on recent Intel CPUs, we need to
         * decide whether to clear the bank based on MCi_STATUS bits such
         * as OVER/UC/EN/PCC/S/AR.
         */
347         if ( mc_need_clearbank_scan )
348             need_clear = mc_need_clearbank_scan(who, status);
349 
350         /*
351          * If this is the first bank with valid MCA DATA, then
352          * try to reserve an entry from the urgent/nonurgent queue
353          * depending on whether we are called from an exception or
354          * a poller;  this can fail (for example dom0 may not
355          * yet have consumed past telemetry).
356          */
357         if ( errcnt++ == 0 )
358         {
359             mctc = mctelem_reserve(which);
360             if ( mctc )
361             {
362                 mci = mctelem_dataptr(mctc);
363                 mcinfo_clear(mci);
364                 mig = x86_mcinfo_reserve(mci, sizeof(*mig), MC_TYPE_GLOBAL);
                /* mc_info should at least be able to hold the global information */
366                 ASSERT(mig);
367                 mca_init_global(mc_flags, mig);
368                 /* A hook here to get global extended msrs */
369                 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
370                     intel_get_extended_msrs(mig, mci);
371             }
372         }
373 
374         /* flag for uncorrected errors */
375         if ( !uc && ((status & MCi_STATUS_UC) != 0) )
376             uc = true;
377 
378         /* flag processor context corrupt */
379         if ( !pcc && ((status & MCi_STATUS_PCC) != 0) )
380             pcc = true;
381 
382         if ( recover && uc )
383             /* uc = true, recover = true, we need not panic. */
384             recover = mc_recoverable_scan(status);
385 
386         mca_init_bank(who, mci, i);
387 
388         if ( mc_callback_bank_extended )
389             mc_callback_bank_extended(mci, i, status);
390 
391         /* By default, need_clear = true */
392         if ( who != MCA_MCE_SCAN && need_clear )
393             /* Clear bank */
394             mcabank_clear(i);
395         else if ( who == MCA_MCE_SCAN && need_clear )
396             mcabanks_set(i, clear_bank);
397     }
398 
399     if ( mig && errcnt > 0 )
400     {
401         if ( pcc )
402             mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
403         else if ( uc )
404             mig->mc_flags |= MC_FLAG_RECOVERABLE;
405         else
406             mig->mc_flags |= MC_FLAG_CORRECTABLE;
407     }
408 
409     if ( sp )
410     {
411         sp->errcnt = errcnt;
412         sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
413         sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
414         sp->lmce = (gstatus & MCG_STATUS_LMCE) != 0;
415         sp->uc = uc;
416         sp->pcc = pcc;
417         sp->recoverable = recover;
418     }
419 
420     return mci != NULL ? mctc : NULL; /* may be NULL */
421 }
422 
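/*
 * Acquire the lock with trylock in a loop, calling mce_panic_check() on each
 * iteration so that this CPU notices a machine check panic in progress
 * instead of spinning here forever.
 */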
static void mce_spin_lock(spinlock_t *lk)
424 {
425     while ( !spin_trylock(lk) )
426     {
427         cpu_relax();
428         mce_panic_check();
429     }
430 }
431 
static void mce_spin_unlock(spinlock_t *lk)
433 {
434     spin_unlock(lk);
435 }
436 
437 static enum mce_result mce_action(const struct cpu_user_regs *regs,
438                                   mctelem_cookie_t mctc);
439 
440 /*
441  * Return:
442  * -1: if system can't be recovered
443  * 0: Continue to next step
444  */
static int mce_urgent_action(const struct cpu_user_regs *regs,
                             mctelem_cookie_t mctc)
447 {
448     uint64_t gstatus;
449 
450     if ( mctc == NULL )
451         return 0;
452 
453     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
454 
    /*
     * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an
     * asynchronous error, and currently we have no way to tell precisely
     * whether the error occurred in the guest or in the hypervisor.
     * To avoid handling the error in the wrong way, we treat it as
     * unrecoverable.
     *
     * Another unrecoverable case is RIPV = 0 while in the hypervisor,
     * since Xen is not preemptible.
     */
464     if ( !(gstatus & MCG_STATUS_RIPV) &&
465          (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
466         return -1;
467 
468     return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
469 }
470 
471 /* Shared #MC handler. */
void mcheck_cmn_handler(const struct cpu_user_regs *regs)
473 {
474     static DEFINE_MCE_BARRIER(mce_trap_bar);
475     static atomic_t severity_cpu = ATOMIC_INIT(-1);
476     static atomic_t found_error = ATOMIC_INIT(0);
477     static cpumask_t mce_fatal_cpus;
478     struct mca_banks *bankmask = mca_allbanks;
479     unsigned int cpu = smp_processor_id();
480     struct mca_banks *clear_bank = per_cpu(mce_clear_banks, cpu);
481     uint64_t gstatus;
482     mctelem_cookie_t mctc = NULL;
483     struct mca_summary bs;
484     bool bcast, lmce;
485 
486     mce_spin_lock(&mce_logout_lock);
487 
488     if ( clear_bank != NULL )
489         memset(clear_bank->bank_map, 0x0,
490                sizeof(long) * BITS_TO_LONGS(clear_bank->num));
491     mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
492     lmce = bs.lmce;
493     bcast = mce_broadcast && !lmce;
494 
495     if ( bs.errcnt )
496     {
497         /*
498          * Uncorrected errors must be dealt with in softirq context.
499          */
500         if ( bs.uc || bs.pcc )
501         {
502             add_taint(TAINT_MACHINE_CHECK);
503             if ( mctc )
504                 mctelem_defer(mctc, lmce);
            /*
             * If PCC = 1 or the error can't be recovered, context is lost,
             * so reboot now without clearing the banks, and deal with
             * the telemetry after reboot (the MSRs are sticky).
             */
510             if ( bs.pcc || !bs.recoverable )
511                 cpumask_set_cpu(cpu, &mce_fatal_cpus);
512         }
513         else if ( mctc != NULL )
514             mctelem_commit(mctc);
515         atomic_set(&found_error, 1);
516 
        /* The last CPU to get here will take care of the check/clean-up etc. */
518         atomic_set(&severity_cpu, cpu);
519 
520         mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%u\n",
521                    *((unsigned long *)clear_bank), cpu);
522         if ( clear_bank != NULL )
523             mcheck_mca_clearbanks(clear_bank);
524     }
525     else if ( mctc != NULL )
526         mctelem_dismiss(mctc);
527     mce_spin_unlock(&mce_logout_lock);
528 
529     mce_barrier_enter(&mce_trap_bar, bcast);
530     if ( mctc != NULL && mce_urgent_action(regs, mctc) )
531         cpumask_set_cpu(cpu, &mce_fatal_cpus);
532     mce_barrier_exit(&mce_trap_bar, bcast);
533 
534     /*
535      * Wait until everybody has processed the trap.
536      */
537     mce_barrier_enter(&mce_trap_bar, bcast);
538     if ( lmce || atomic_read(&severity_cpu) == cpu )
539     {
        /*
         * According to the SDM, if no error bank is found on any CPU,
         * something unexpected is happening; we can't do any recovery
         * other than resetting the system.
         */
545         if ( atomic_read(&found_error) == 0 )
546             mc_panic("MCE: No CPU found valid MCE, need reset");
547         if ( !cpumask_empty(&mce_fatal_cpus) )
548         {
549             char ebuf[96];
550 
551             snprintf(ebuf, sizeof(ebuf),
552                      "MCE: Fatal error happened on CPUs %*pb",
553                      CPUMASK_PR(&mce_fatal_cpus));
554 
555             mc_panic(ebuf);
556         }
557         atomic_set(&found_error, 0);
558         atomic_set(&severity_cpu, -1);
559     }
560     mce_barrier_exit(&mce_trap_bar, bcast);
561 
562     /* Clear flags after above fatal check */
563     mce_barrier_enter(&mce_trap_bar, bcast);
564     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
565     if ( (gstatus & MCG_STATUS_MCIP) != 0 )
566     {
567         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
568         mca_wrmsr(MSR_IA32_MCG_STATUS, 0);
569     }
570     mce_barrier_exit(&mce_trap_bar, bcast);
571 
572     raise_softirq(MACHINE_CHECK_SOFTIRQ);
573 }
574 
void mcheck_mca_clearbanks(struct mca_banks *bankmask)
576 {
577     int i;
578 
579     for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
580     {
581         if ( !mcabanks_test(i, bankmask) )
582             continue;
583         mcabank_clear(i);
584     }
585 }
586 
/* Check for the existence of Machine Check support. */
bool mce_available(const struct cpuinfo_x86 *c)
589 {
590     return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
591 }
592 
593 /*
594  * Check if bank 0 is usable for MCE. It isn't for Intel P6 family
595  * before model 0x1a.
596  */
unsigned int mce_firstbank(struct cpuinfo_x86 *c)
598 {
599     return c->x86 == 6 &&
600            c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a;
601 }
602 
int show_mca_info(int inited, struct cpuinfo_x86 *c)
604 {
605     static enum mcheck_type g_type = mcheck_unset;
606 
607     if ( inited != g_type )
608     {
609         char prefix[20];
610         static const char *const type_str[] = {
611             [mcheck_amd_famXX] = "AMD",
612             [mcheck_amd_k8] = "AMD K8",
613             [mcheck_intel] = "Intel",
614             [mcheck_hygon] = "Hygon"
615         };
616 
617         snprintf(prefix, ARRAY_SIZE(prefix), "%sCPU%u: ",
618                  g_type != mcheck_unset ? XENLOG_WARNING : XENLOG_INFO,
619                  smp_processor_id());
620         BUG_ON(inited >= ARRAY_SIZE(type_str));
621         switch ( inited )
622         {
623         default:
624             printk("%s%s machine check reporting enabled\n",
625                    prefix, type_str[inited]);
626             break;
627 
628         case mcheck_amd_famXX:
629         case mcheck_hygon:
630             printk("%s%s Fam%xh machine check reporting enabled\n",
631                    prefix, type_str[inited], c->x86);
632             break;
633 
634         case mcheck_none:
635             printk("%sNo machine check initialization\n", prefix);
636             break;
637         }
638         g_type = inited;
639     }
640 
641     return 0;
642 }
643 
static void set_poll_bankmask(struct cpuinfo_x86 *c)
645 {
646     int cpu = smp_processor_id();
647     struct mca_banks *mb;
648 
649     mb = per_cpu(poll_bankmask, cpu);
650     BUG_ON(!mb);
651 
652     if ( cmci_support && opt_mce )
653     {
654         const struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
655 
656         if ( unlikely(cmci->num < mb->num) )
657             bitmap_fill(mb->bank_map, mb->num);
658         bitmap_copy(mb->bank_map, cmci->bank_map, min(mb->num, cmci->num));
659     }
660     else
661     {
662         bitmap_copy(mb->bank_map, mca_allbanks->bank_map,
663                     per_cpu(nr_mce_banks, cpu));
664         if ( mce_firstbank(c) )
665             mcabanks_clear(0, mb);
666     }
667 }
668 
/* The per-bank ctl/status init is platform specific because of AMD's quirk */
static int mca_cap_init(void)
671 {
672     uint64_t msr_content;
673     unsigned int nr, cpu = smp_processor_id();
674 
675     rdmsrl(MSR_IA32_MCG_CAP, msr_content);
676 
677     if ( msr_content & MCG_CTL_P ) /* Control register present ? */
678         wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
679 
680     per_cpu(nr_mce_banks, cpu) = nr = MASK_EXTR(msr_content, MCG_CAP_COUNT);
681 
682     if ( !nr )
683     {
684         printk(XENLOG_INFO
685                "CPU%u: No MCE banks present. Machine check support disabled\n",
686                cpu);
687         return -ENODEV;
688     }
689 
690     /* mcabanks_alloc depends on nr_mce_banks */
691     if ( !mca_allbanks || nr > mca_allbanks->num )
692     {
693         unsigned int i;
694         struct mca_banks *all = mcabanks_alloc(nr);
695 
696         if ( !all )
697             return -ENOMEM;
698         for ( i = 0; i < nr; i++ )
            mcabanks_set(i, all);
700         mcabanks_free(xchg(&mca_allbanks, all));
701     }
702 
703     return 0;
704 }
705 
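/* Free a CPU's poll and clear bank bitmaps. */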
static void cpu_bank_free(unsigned int cpu)
707 {
708     struct mca_banks *poll = per_cpu(poll_bankmask, cpu);
709     struct mca_banks *clr = per_cpu(mce_clear_banks, cpu);
710 
711     mcabanks_free(poll);
712     mcabanks_free(clr);
713 
714     per_cpu(poll_bankmask, cpu) = NULL;
715     per_cpu(mce_clear_banks, cpu) = NULL;
716 }
717 
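/* Allocate (or reuse already-allocated) poll and clear bank bitmaps for a CPU. */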
static int cpu_bank_alloc(unsigned int cpu)
719 {
720     unsigned int nr = per_cpu(nr_mce_banks, cpu);
721     struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(nr);
722     struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(nr);
723 
724     if ( !poll || !clr )
725     {
726         mcabanks_free(poll);
727         mcabanks_free(clr);
728         return -ENOMEM;
729     }
730 
731     per_cpu(poll_bankmask, cpu) = poll;
732     per_cpu(mce_clear_banks, cpu) = clr;
733     return 0;
734 }
735 
static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
738 {
739     unsigned int cpu = (unsigned long)hcpu;
740     int rc = 0;
741 
742     switch ( action )
743     {
744     case CPU_UP_PREPARE:
745         rc = cpu_bank_alloc(cpu);
746         break;
747 
748     case CPU_UP_CANCELED:
749     case CPU_DEAD:
750         if ( !park_offline_cpus )
751             cpu_bank_free(cpu);
752         break;
753 
754     case CPU_REMOVE:
755         if ( park_offline_cpus )
756             cpu_bank_free(cpu);
757         break;
758     }
759 
760     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
761 }
762 
763 static struct notifier_block cpu_nfb = {
764     .notifier_call = cpu_callback
765 };
766 
767 /* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c, bool bsp)
769 {
770     enum mcheck_type inited = mcheck_none;
771     unsigned int cpu = smp_processor_id();
772 
773     if ( !opt_mce )
774     {
775         if ( bsp )
776             printk(XENLOG_INFO "MCE support disabled by bootparam\n");
777         return;
778     }
779 
780     if ( !mce_available(c) )
781     {
782         printk(XENLOG_INFO "CPU%i: No machine check support available\n", cpu);
783         return;
784     }
785 
    /* Hardware enable */
787     if ( mca_cap_init() )
788         return;
789 
790     if ( !bsp )
791     {
792         per_cpu(poll_bankmask, cpu)->num = per_cpu(nr_mce_banks, cpu);
793         per_cpu(mce_clear_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
794     }
795     else if ( cpu_bank_alloc(cpu) )
796         panic("Insufficient memory for MCE bank allocations\n");
797 
798     switch ( c->x86_vendor )
799     {
800     case X86_VENDOR_AMD:
801     case X86_VENDOR_HYGON:
802         inited = amd_mcheck_init(c);
803         break;
804 
805     case X86_VENDOR_INTEL:
806         switch ( c->x86 )
807         {
808         case 6:
809         case 15:
810             inited = intel_mcheck_init(c, bsp);
811             break;
812         }
813         break;
814 
815     default:
816         break;
817     }
818 
819     show_mca_info(inited, c);
820     if ( inited == mcheck_none || inited == mcheck_unset )
821         goto out;
822 
823     intpose_init();
824 
825     if ( bsp )
826     {
827         mctelem_init(sizeof(struct mc_info));
828         register_cpu_notifier(&cpu_nfb);
829     }
830 
831     /* Turn on MCE now */
832     set_in_cr4(X86_CR4_MCE);
833 
834     set_poll_bankmask(c);
835 
836     return;
837  out:
838     if ( bsp )
839     {
840         cpu_bank_free(smp_processor_id());
841         mcabanks_free(mca_allbanks);
842         mca_allbanks = NULL;
843     }
844 }
845 
static void mcinfo_clear(struct mc_info *mi)
847 {
848     memset(mi, 0, sizeof(struct mc_info));
849     x86_mcinfo_nentries(mi) = 0;
850 }
851 
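/*
 * Reserve 'size' bytes for a new entry of the given type at the end of the
 * mc_info buffer, zero it, and bump the entry count. Returns NULL if the
 * buffer has no space left.
 */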
void *x86_mcinfo_reserve(struct mc_info *mi,
                         unsigned int size, unsigned int type)
854 {
855     int i;
856     unsigned long end1, end2;
857     struct mcinfo_common *mic_base, *mic_index;
858 
859     mic_index = mic_base = x86_mcinfo_first(mi);
860 
861     /* go to first free entry */
862     for ( i = 0; i < x86_mcinfo_nentries(mi); i++ )
863         mic_index = x86_mcinfo_next(mic_index);
864 
    /* check if there is enough space */
866     end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
867     end2 = (unsigned long)((uint8_t *)mic_index + size);
868 
869     if ( end1 < end2 )
870     {
871         mce_printk(MCE_CRITICAL,
872                    "mcinfo_add: No space left in mc_info\n");
873         return NULL;
874     }
875 
876     /* there's enough space. add entry. */
877     x86_mcinfo_nentries(mi)++;
878 
879     memset(mic_index, 0, size);
880     mic_index->size = size;
881     mic_index->type = type;
882 
883     return mic_index;
884 }
885 
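/*
 * Translate the global/bank telemetry into a struct mce and hand it to the
 * APEI code, so the record can (hopefully) survive the impending panic.
 */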
static void x86_mcinfo_apei_save(
    struct mcinfo_global *mc_global, struct mcinfo_bank *mc_bank)
888 {
889     struct mce m;
890 
891     memset(&m, 0, sizeof(struct mce));
892 
893     m.cpu = mc_global->mc_coreid;
894     m.cpuvendor = boot_cpu_data.x86_vendor;
895     m.cpuid = cpuid_eax(1);
896     m.socketid = mc_global->mc_socketid;
897     m.apicid = mc_global->mc_apicid;
898 
899     m.mcgstatus = mc_global->mc_gstatus;
900     m.status = mc_bank->mc_status;
901     m.misc = mc_bank->mc_misc;
902     m.addr = mc_bank->mc_addr;
903     m.bank = mc_bank->mc_bank;
904 
905     apei_write_mce(&m);
906 }
907 
/*
 * Dump machine check information in a format mcelog can parse.
 * This is used only when Dom0 does not take the notification.
 */
void x86_mcinfo_dump(struct mc_info *mi)
914 {
915     struct mcinfo_common *mic = NULL;
916     struct mcinfo_global *mc_global;
917     struct mcinfo_bank *mc_bank;
918 
919     /* first print the global info */
920     x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
921     if ( mic == NULL )
922         return;
923     mc_global = (struct mcinfo_global *)mic;
924     if ( mc_global->mc_flags & MC_FLAG_MCE )
925         printk(XENLOG_WARNING
926                "CPU%d: Machine Check Exception: %16"PRIx64"\n",
927                mc_global->mc_coreid, mc_global->mc_gstatus);
928     else if ( mc_global->mc_flags & MC_FLAG_CMCI )
929         printk(XENLOG_WARNING "CMCI occurred on CPU %d.\n",
930                mc_global->mc_coreid);
931     else if ( mc_global->mc_flags & MC_FLAG_POLLED )
932         printk(XENLOG_WARNING "POLLED occurred on CPU %d.\n",
933                mc_global->mc_coreid);
934 
935     /* then the bank information */
936     x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
937     do {
938         if ( mic == NULL )
939             return;
940         if ( mic->type != MC_TYPE_BANK )
941             goto next;
942 
943         mc_bank = (struct mcinfo_bank *)mic;
944 
945         printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
946                mc_bank->mc_bank,
947                mc_bank->mc_status);
948         if ( mc_bank->mc_status & MCi_STATUS_MISCV )
949             printk("[%16"PRIx64"]", mc_bank->mc_misc);
950         if ( mc_bank->mc_status & MCi_STATUS_ADDRV )
951             printk(" at %16"PRIx64, mc_bank->mc_addr);
952         printk("\n");
953 
954         if ( is_mc_panic )
955             x86_mcinfo_apei_save(mc_global, mc_bank);
956 
957  next:
958         mic = x86_mcinfo_next(mic); /* next entry */
959         if ( (mic == NULL) || (mic->size == 0) )
960             break;
961     } while ( 1 );
962 }
963 
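/*
 * Run on each online CPU (via on_each_cpu()) to fill in that CPU's slot of
 * the xen_mc_logical_cpu_t array for XEN_MC_physcpuinfo.
 */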
static void do_mc_get_cpu_info(void *v)
965 {
966     int cpu = smp_processor_id();
967     int cindex, cpn;
968     struct cpuinfo_x86 *c;
969     xen_mc_logical_cpu_t *log_cpus, *xcp;
970     uint32_t junk, ebx;
971 
972     log_cpus = v;
973     c = &cpu_data[cpu];
974     cindex = 0;
975     cpn = cpu - 1;
976 
    /*
     * Deal with sparse masks, condensed into a contiguous array.
     */
980     while ( cpn >= 0 )
981     {
982         if ( cpu_online(cpn) )
983             cindex++;
984         cpn--;
985     }
986 
987     xcp = &log_cpus[cindex];
988     c = &cpu_data[cpu];
989     xcp->mc_cpunr = cpu;
990     x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
991                         &xcp->mc_coreid, &xcp->mc_threadid,
992                         &xcp->mc_apicid, &xcp->mc_ncores,
993                         &xcp->mc_ncores_active, &xcp->mc_nthreads);
994     xcp->mc_cpuid_level = c->cpuid_level;
995     xcp->mc_family = c->x86;
996     xcp->mc_vendor = c->x86_vendor;
997     xcp->mc_model = c->x86_model;
998     xcp->mc_step = c->x86_mask;
999     xcp->mc_cache_size = c->x86_cache_size;
1000     xcp->mc_cache_alignment = c->x86_cache_alignment;
1001     memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
1002     memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
1003     memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
1004 
1005     /*
1006      * This part needs to run on the CPU itself.
1007      */
1008     xcp->mc_nmsrvals = 1;
1009     xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
1010     rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
1011 
1012     if ( ppin_msr && xcp->mc_nmsrvals < ARRAY_SIZE(xcp->mc_msrvalues) )
1013     {
1014         xcp->mc_msrvalues[xcp->mc_nmsrvals].reg = ppin_msr;
1015         rdmsrl(ppin_msr, xcp->mc_msrvalues[xcp->mc_nmsrvals].value);
1016         ++xcp->mc_nmsrvals;
1017     }
1018 
1019     if ( c->cpuid_level >= 1 )
1020     {
1021         cpuid(1, &junk, &ebx, &junk, &junk);
1022         xcp->mc_clusterid = (ebx >> 24) & 0xff;
1023     }
1024     else
1025         xcp->mc_clusterid = get_apic_id();
1026 }
1027 
void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
                         uint16_t *threadid, uint32_t *apicid,
                         unsigned *ncores, unsigned *ncores_active,
                         unsigned *nthreads)
1032 {
1033     struct cpuinfo_x86 *c;
1034 
1035     *apicid = cpu_physical_id(cpu);
1036     c = &cpu_data[cpu];
1037     if ( c->apicid == BAD_APICID )
1038     {
1039         *chipid = cpu;
1040         *coreid = 0;
1041         *threadid = 0;
1042         if ( ncores != NULL )
1043             *ncores = 1;
1044         if ( ncores_active != NULL )
1045             *ncores_active = 1;
1046         if ( nthreads != NULL )
1047             *nthreads = 1;
1048     }
1049     else
1050     {
1051         *chipid = c->phys_proc_id;
1052         if ( c->x86_max_cores > 1 )
1053             *coreid = c->cpu_core_id;
1054         else
1055             *coreid = 0;
1056         *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1057         if ( ncores != NULL )
1058             *ncores = c->x86_max_cores;
1059         if ( ncores_active != NULL )
1060             *ncores_active = c->booted_cores;
1061         if ( nthreads != NULL )
1062             *nthreads = c->x86_num_siblings;
1063     }
1064 }
1065 
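/*
 * MSR interposition table: injected (cpu, msr, value) tuples are recorded
 * here so that MCA MSR reads can see the injected value in place of the
 * hardware one, presumably via the mca_rdmsr() wrapper.
 */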
1066 #define INTPOSE_NENT 50
1067 
1068 static struct intpose_ent {
1069     unsigned int cpu_nr;
1070     uint64_t msr;
1071     uint64_t val;
1072 } intpose_arr[INTPOSE_NENT];
1073 
static void intpose_init(void)
1075 {
1076     static int done;
1077     int i;
1078 
1079     if ( done++ > 0 )
1080         return;
1081 
1082     for ( i = 0; i < INTPOSE_NENT; i++ )
1083         intpose_arr[i].cpu_nr = -1;
1084 
1085 }
1086 
struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
                                   uint64_t *valp)
1089 {
1090     int i;
1091 
1092     for ( i = 0; i < INTPOSE_NENT; i++ )
1093     {
1094         if ( intpose_arr[i].cpu_nr == cpu_nr && intpose_arr[i].msr == msr )
1095         {
1096             if ( valp != NULL )
1097                 *valp = intpose_arr[i].val;
1098             return &intpose_arr[i];
1099         }
1100     }
1101 
1102     return NULL;
1103 }
1104 
static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1106 {
1107     struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1108     int i;
1109 
1110     if ( ent )
1111     {
1112         ent->val = val;
1113         return;
1114     }
1115 
1116     for ( i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++ )
1117     {
1118         if ( ent->cpu_nr == -1 )
1119         {
1120             ent->cpu_nr = cpu_nr;
1121             ent->msr = msr;
1122             ent->val = val;
1123             return;
1124         }
1125     }
1126 
1127     printk("intpose_add: interpose array full - request dropped\n");
1128 }
1129 
bool intpose_inval(unsigned int cpu_nr, uint64_t msr)
1131 {
1132     struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1133 
1134     if ( !ent )
1135         return false;
1136 
1137     ent->cpu_nr = -1;
1138     return true;
1139 }
1140 
1141 #define IS_MCA_BANKREG(r, cpu) \
1142     ((r) >= MSR_IA32_MC0_CTL && \
1143      (r) <= MSR_IA32_MCx_MISC(per_cpu(nr_mce_banks, cpu) - 1) && \
1144      ((r) - MSR_IA32_MC0_CTL) % 4) /* excludes MCi_CTL */
1145 
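/*
 * Sanity-check an MSR injection request: only MCA bank registers (excluding
 * MCi_CTL), MCG_STATUS and the AMD Fam10h+ MC4_MISC MSRs are accepted.
 */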
static bool x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1147 {
1148     const struct cpuinfo_x86 *c = &cpu_data[mci->mcinj_cpunr];
1149     int i, errs = 0;
1150 
1151     for ( i = 0; i < mci->mcinj_count; i++ )
1152     {
1153         uint64_t reg = mci->mcinj_msr[i].reg;
1154         const char *reason = NULL;
1155 
1156         if ( IS_MCA_BANKREG(reg, mci->mcinj_cpunr) )
1157         {
1158             if ( c->x86_vendor == X86_VENDOR_AMD )
1159             {
                /*
                 * On AMD we can set MCi_STATUS_WREN in the HWCR MSR so
                 * that non-zero writes to bank MSRs do not #GP.  The
                 * injector in dom0 should set that bit, but we detect
                 * when it is necessary and set it as a courtesy to
                 * avoid #GP in the hypervisor.
                 */
1168                 mci->mcinj_flags |=
1169                     _MC_MSRINJ_F_REQ_HWCR_WREN;
1170                 continue;
1171             }
1172             else
1173             {
                /*
                 * No alternative but to interpose, so require
                 * that the injector specifies interposition.
                 */
1178                 if ( !(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) )
1179                     reason = "must specify interposition";
1180             }
1181         }
1182         else
1183         {
1184             switch ( reg )
1185             {
1186             /* MSRs acceptable on all x86 cpus */
1187             case MSR_IA32_MCG_STATUS:
1188                 break;
1189 
1190             case MSR_F10_MC4_MISC1:
1191             case MSR_F10_MC4_MISC2:
1192             case MSR_F10_MC4_MISC3:
1193                 if ( c->x86_vendor != X86_VENDOR_AMD )
1194                     reason = "only supported on AMD";
1195                 else if ( c->x86 < 0x10 )
1196                     reason = "only supported on AMD Fam10h+";
1197                 break;
1198 
1199             /* MSRs that the HV will take care of */
1200             case MSR_K8_HWCR:
1201                 if ( c->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
1202                     reason = "HV will operate HWCR";
1203                 else
1204                     reason = "only supported on AMD or Hygon";
1205                 break;
1206 
1207             default:
1208                 reason = "not a recognized MCA MSR";
1209                 break;
1210             }
1211         }
1212 
1213         if ( reason != NULL )
1214         {
1215             printk("HV MSR INJECT ERROR: MSR %#Lx %s\n",
1216                    (unsigned long long)mci->mcinj_msr[i].reg, reason);
1217             errs++;
1218         }
1219     }
1220 
1221     return !errs;
1222 }
1223 
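/*
 * Set K8_HWCR_MCi_STATUS_WREN if it isn't already set, returning the old
 * HWCR value so it can be restored afterwards.
 */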
static uint64_t x86_mc_hwcr_wren(void)
1225 {
1226     uint64_t old;
1227 
1228     rdmsrl(MSR_K8_HWCR, old);
1229 
1230     if ( !(old & K8_HWCR_MCi_STATUS_WREN) )
1231     {
1232         uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1233         wrmsrl(MSR_K8_HWCR, new);
1234     }
1235 
1236     return old;
1237 }
1238 
static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1240 {
1241     if ( !(hwcr & K8_HWCR_MCi_STATUS_WREN) )
1242         wrmsrl(MSR_K8_HWCR, hwcr);
1243 }
1244 
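/*
 * Runs on the target CPU (via on_selected_cpus()): either record the
 * requested MSR values in the interposition table or write them to the
 * real MSRs.
 */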
static void x86_mc_msrinject(void *data)
1246 {
1247     struct xen_mc_msrinject *mci = data;
1248     struct mcinfo_msr *msr;
1249     uint64_t hwcr = 0;
1250     int intpose;
1251     int i;
1252 
1253     if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1254         hwcr = x86_mc_hwcr_wren();
1255 
1256     intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1257 
1258     for ( i = 0, msr = &mci->mcinj_msr[0]; i < mci->mcinj_count; i++, msr++ )
1259     {
1260         printk("HV MSR INJECT (%s) target %u actual %u MSR %#Lx <-- %#Lx\n",
1261                intpose ? "interpose" : "hardware",
1262                mci->mcinj_cpunr, smp_processor_id(),
1263                (unsigned long long)msr->reg,
1264                (unsigned long long)msr->value);
1265 
1266         if ( intpose )
1267             intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1268         else
1269             wrmsrl(msr->reg, msr->value);
1270     }
1271 
1272     if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1273         x86_mc_hwcr_wren_restore(hwcr);
1274 }
1275 
1276 /*ARGSUSED*/
static void x86_mc_mceinject(void *data)
1278 {
1279     printk("Simulating #MC on cpu %d\n", smp_processor_id());
1280     __asm__ __volatile__("int $0x12");
1281 }
1282 
1283 #if BITS_PER_LONG == 64
1284 
1285 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1286 #define COOKIE2ID(c) ((uint64_t)(c))
1287 
1288 #elif defined(BITS_PER_LONG)
1289 #error BITS_PER_LONG has unexpected value
1290 #else
1291 #error BITS_PER_LONG definition absent
1292 #endif
1293 
1294 # include <compat/arch-x86/xen-mca.h>
1295 
1296 # define xen_mcinfo_msr              mcinfo_msr
1297 CHECK_mcinfo_msr;
1298 # undef xen_mcinfo_msr
1299 # undef CHECK_mcinfo_msr
1300 # define CHECK_mcinfo_msr            struct mcinfo_msr
1301 
1302 # define xen_mcinfo_common           mcinfo_common
1303 CHECK_mcinfo_common;
1304 # undef xen_mcinfo_common
1305 # undef CHECK_mcinfo_common
1306 # define CHECK_mcinfo_common         struct mcinfo_common
1307 
1308 CHECK_FIELD_(struct, mc_fetch, flags);
1309 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1310 # define CHECK_compat_mc_fetch       struct mc_fetch
1311 
1312 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1313 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1314 
1315 #define CHECK_compat_mc_inject_v2   struct mc_inject_v2
1316 CHECK_mc;
1317 # undef CHECK_compat_mc_fetch
1318 # undef CHECK_compat_mc_physcpuinfo
1319 
1320 # define xen_mc_info                 mc_info
1321 CHECK_mc_info;
1322 # undef xen_mc_info
1323 
1324 # define xen_mcinfo_global           mcinfo_global
1325 CHECK_mcinfo_global;
1326 # undef xen_mcinfo_global
1327 
1328 # define xen_mcinfo_bank             mcinfo_bank
1329 CHECK_mcinfo_bank;
1330 # undef xen_mcinfo_bank
1331 
1332 # define xen_mcinfo_extended         mcinfo_extended
1333 CHECK_mcinfo_extended;
1334 # undef xen_mcinfo_extended
1335 
1336 # define xen_mcinfo_recovery         mcinfo_recovery
1337 # define xen_cpu_offline_action      cpu_offline_action
1338 # define xen_page_offline_action     page_offline_action
1339 CHECK_mcinfo_recovery;
1340 # undef xen_cpu_offline_action
1341 # undef xen_page_offline_action
1342 # undef xen_mcinfo_recovery
1343 
1344 /* Machine Check Architecture Hypercall */
long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
1346 {
1347     long ret = 0;
1348     struct xen_mc curop, *op = &curop;
1349     struct vcpu *v = current;
1350     union {
1351         struct xen_mc_fetch *nat;
1352         struct compat_mc_fetch *cmp;
1353     } mc_fetch;
1354     union {
1355         struct xen_mc_physcpuinfo *nat;
1356         struct compat_mc_physcpuinfo *cmp;
1357     } mc_physcpuinfo;
1358     uint32_t flags, cmdflags;
1359     int nlcpu;
1360     mctelem_cookie_t mctc;
1361     mctelem_class_t which;
1362     unsigned int target;
1363     struct xen_mc_msrinject *mc_msrinject;
1364     struct xen_mc_mceinject *mc_mceinject;
1365 
1366     ret = xsm_do_mca(XSM_PRIV);
1367     if ( ret )
1368         return x86_mcerr("", ret);
1369 
1370     if ( copy_from_guest(op, u_xen_mc, 1) )
1371         return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1372 
1373     if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1374         return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1375 
1376     switch ( op->cmd )
1377     {
1378     case XEN_MC_fetch:
1379         mc_fetch.nat = &op->u.mc_fetch;
1380         cmdflags = mc_fetch.nat->flags;
1381 
1382         switch ( cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT) )
1383         {
1384         case XEN_MC_NONURGENT:
1385             which = MC_NONURGENT;
1386             break;
1387 
1388         case XEN_MC_URGENT:
1389             which = MC_URGENT;
1390             break;
1391 
1392         default:
1393             return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1394         }
1395 
1396         flags = XEN_MC_OK;
1397 
1398         if ( cmdflags & XEN_MC_ACK )
1399         {
1400             mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1401             mctelem_ack(which, cookie);
1402         }
1403         else
1404         {
1405             if ( !is_pv_32bit_vcpu(v)
1406                  ? guest_handle_is_null(mc_fetch.nat->data)
1407                  : compat_handle_is_null(mc_fetch.cmp->data) )
1408                 return x86_mcerr("do_mca fetch: guest buffer "
1409                                  "invalid", -EINVAL);
1410 
1411             mctc = mctelem_consume_oldest_begin(which);
1412             if ( mctc )
1413             {
1414                 struct mc_info *mcip = mctelem_dataptr(mctc);
1415                 if ( !is_pv_32bit_vcpu(v)
1416                      ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1417                      : copy_to_compat(mc_fetch.cmp->data, mcip, 1) )
1418                 {
1419                     ret = -EFAULT;
1420                     flags |= XEN_MC_FETCHFAILED;
1421                     mc_fetch.nat->fetch_id = 0;
1422                 }
1423                 else
1424                     mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1425                 mctelem_consume_oldest_end(mctc);
1426             }
1427             else
1428             {
1429                 /* There is no data */
1430                 flags |= XEN_MC_NODATA;
1431                 mc_fetch.nat->fetch_id = 0;
1432             }
1433 
1434             mc_fetch.nat->flags = flags;
            if ( copy_to_guest(u_xen_mc, op, 1) != 0 )
1436                 ret = -EFAULT;
1437         }
1438 
1439         break;
1440 
1441     case XEN_MC_notifydomain:
1442         return x86_mcerr("do_mca notify unsupported", -EINVAL);
1443 
1444     case XEN_MC_physcpuinfo:
1445         mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1446         nlcpu = num_online_cpus();
1447 
1448         if ( !is_pv_32bit_vcpu(v)
1449              ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1450              : !compat_handle_is_null(mc_physcpuinfo.cmp->info) )
1451         {
1452             xen_mc_logical_cpu_t *log_cpus;
1453 
1454             if ( mc_physcpuinfo.nat->ncpus <= 0 )
1455                 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1456                                  -EINVAL);
1457             nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1458             log_cpus = xzalloc_array(xen_mc_logical_cpu_t, nlcpu);
1459             if ( log_cpus == NULL )
1460                 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1461             on_each_cpu(do_mc_get_cpu_info, log_cpus, 1);
1462             if ( !is_pv_32bit_vcpu(v)
1463                  ? copy_to_guest(mc_physcpuinfo.nat->info, log_cpus, nlcpu)
1464                  : copy_to_compat(mc_physcpuinfo.cmp->info, log_cpus, nlcpu) )
1465                 ret = -EFAULT;
1466             xfree(log_cpus);
1467         }
1468 
1469         mc_physcpuinfo.nat->ncpus = nlcpu;
1470 
1471         if ( copy_to_guest(u_xen_mc, op, 1) )
1472             return x86_mcerr("do_mca cpuinfo", -EFAULT);
1473 
1474         break;
1475 
1476     case XEN_MC_msrinject:
1477         if ( !mca_allbanks || !mca_allbanks->num )
1478             return x86_mcerr("do_mca inject", -ENODEV);
1479 
1480         mc_msrinject = &op->u.mc_msrinject;
1481         target = mc_msrinject->mcinj_cpunr;
1482 
1483         if ( target >= nr_cpu_ids )
1484             return x86_mcerr("do_mca inject: bad target", -EINVAL);
1485 
1486         if ( !cpu_online(target) )
1487             return x86_mcerr("do_mca inject: target offline",
1488                              -EINVAL);
1489 
1490         if ( !per_cpu(nr_mce_banks, target) )
1491             return x86_mcerr("do_mca inject: no banks", -ENOENT);
1492 
1493         if ( mc_msrinject->mcinj_count == 0 )
1494             return 0;
1495 
1496         if ( mc_msrinject->mcinj_flags & MC_MSRINJ_F_GPADDR )
1497         {
1498             domid_t domid;
1499             struct domain *d;
1500             struct mcinfo_msr *msr;
1501             unsigned int i;
1502             paddr_t gaddr;
1503             unsigned long gfn, mfn;
1504             p2m_type_t t;
1505 
1506             domid = (mc_msrinject->mcinj_domid == DOMID_SELF) ?
1507                     current->domain->domain_id : mc_msrinject->mcinj_domid;
1508             if ( domid >= DOMID_FIRST_RESERVED )
1509                 return x86_mcerr("do_mca inject: incompatible flag "
1510                                  "MC_MSRINJ_F_GPADDR with domain %d",
1511                                  -EINVAL, domid);
1512 
1513             d = get_domain_by_id(domid);
1514             if ( d == NULL )
1515                 return x86_mcerr("do_mca inject: bad domain id %d",
1516                                  -EINVAL, domid);
1517 
1518             for ( i = 0, msr = &mc_msrinject->mcinj_msr[0];
1519                   i < mc_msrinject->mcinj_count;
1520                   i++, msr++ )
1521             {
1522                 gaddr = msr->value;
1523                 gfn = PFN_DOWN(gaddr);
1524                 mfn = mfn_x(get_gfn(d, gfn, &t));
1525 
1526                 if ( mfn == mfn_x(INVALID_MFN) )
1527                 {
1528                     put_gfn(d, gfn);
1529                     put_domain(d);
1530                     return x86_mcerr("do_mca inject: bad gfn %#lx of domain %d",
1531                                      -EINVAL, gfn, domid);
1532                 }
1533 
1534                 msr->value = pfn_to_paddr(mfn) | (gaddr & (PAGE_SIZE - 1));
1535 
1536                 put_gfn(d, gfn);
1537             }
1538 
1539             put_domain(d);
1540         }
1541 
1542         if ( !x86_mc_msrinject_verify(mc_msrinject) )
1543             return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1544 
1545         add_taint(TAINT_ERROR_INJECT);
1546 
1547         on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1548                          mc_msrinject, 1);
1549 
1550         break;
1551 
1552     case XEN_MC_mceinject:
1553         if ( !mca_allbanks || !mca_allbanks->num )
1554             return x86_mcerr("do_mca #MC", -ENODEV);
1555 
1556         mc_mceinject = &op->u.mc_mceinject;
1557         target = mc_mceinject->mceinj_cpunr;
1558 
1559         if ( target >= nr_cpu_ids )
1560             return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1561 
1562         if ( !cpu_online(target) )
1563             return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1564 
1565         if ( !per_cpu(nr_mce_banks, target) )
1566             return x86_mcerr("do_mca #MC: no banks", -ENOENT);
1567 
1568         add_taint(TAINT_ERROR_INJECT);
1569 
1570         if ( mce_broadcast )
1571             on_each_cpu(x86_mc_mceinject, mc_mceinject, 1);
1572         else
1573             on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1574                              mc_mceinject, 1);
1575         break;
1576 
1577     case XEN_MC_inject_v2:
1578     {
1579         const cpumask_t *cpumap;
1580         cpumask_var_t cmv;
1581         bool broadcast = op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST;
1582 
1583         if ( !mca_allbanks || !mca_allbanks->num )
1584             return x86_mcerr("do_mca #MC", -ENODEV);
1585 
1586         if ( broadcast )
1587             cpumap = &cpu_online_map;
1588         else
1589         {
1590             ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap);
1591             if ( ret )
1592                 break;
1593             cpumap = cmv;
1594             if ( !cpumask_intersects(cpumap, &cpu_online_map) )
1595             {
1596                 free_cpumask_var(cmv);
1597                 ret = x86_mcerr("No online CPU passed\n", -EINVAL);
1598                 break;
1599             }
1600             if ( !cpumask_subset(cpumap, &cpu_online_map) )
1601                 dprintk(XENLOG_INFO,
1602                         "Not all required CPUs are online\n");
1603         }
1604 
1605         for_each_cpu(target, cpumap)
1606             if ( cpu_online(target) && !per_cpu(nr_mce_banks, target) )
1607             {
1608                 ret = x86_mcerr("do_mca #MC: CPU%u has no banks",
1609                                 -ENOENT, target);
1610                 break;
1611             }
1612         if ( ret )
1613             break;
1614 
1615         switch ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK )
1616         {
1617         case XEN_MC_INJECT_TYPE_MCE:
1618             if ( mce_broadcast &&
1619                  !cpumask_equal(cpumap, &cpu_online_map) )
                printk("Not triggering MCE on all CPUs, the system may HANG!\n");
1621             on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1622             break;
1623 
1624         case XEN_MC_INJECT_TYPE_CMCI:
1625             if ( !cmci_apic_vector )
1626                 ret = x86_mcerr("No CMCI supported in platform\n", -EINVAL);
1627             else
1628             {
1629                 if ( cpumask_test_cpu(smp_processor_id(), cpumap) )
1630                     send_IPI_self(cmci_apic_vector);
1631                 send_IPI_mask(cpumap, cmci_apic_vector);
1632             }
1633             break;
1634 
1635         case XEN_MC_INJECT_TYPE_LMCE:
1636             if ( !lmce_support )
1637             {
1638                 ret = x86_mcerr("No LMCE support", -EINVAL);
1639                 break;
1640             }
1641             if ( broadcast )
1642             {
1643                 ret = x86_mcerr("Broadcast cannot be used with LMCE", -EINVAL);
1644                 break;
1645             }
1646             /* Ensure at most one CPU is specified. */
1647             if ( nr_cpu_ids > cpumask_next(cpumask_first(cpumap), cpumap) )
1648             {
1649                 ret = x86_mcerr("More than one CPU specified for LMCE",
1650                                 -EINVAL);
1651                 break;
1652             }
1653             on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1654             break;
1655 
1656         default:
1657             ret = x86_mcerr("Wrong mca type\n", -EINVAL);
1658             break;
1659         }
1660 
1661         if ( cpumap != &cpu_online_map )
1662             free_cpumask_var(cmv);
1663 
1664         break;
1665     }
1666 
1667     default:
1668         return x86_mcerr("do_mca: bad command", -EINVAL);
1669     }
1670 
1671     return ret;
1672 }
1673 
1674 int mcinfo_dumpped;
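/* mctelem_process_deferred() callback: dump one deferred record at panic time. */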
static int x86_mcinfo_dump_panic(mctelem_cookie_t mctc)
1676 {
1677     struct mc_info *mcip = mctelem_dataptr(mctc);
1678 
1679     x86_mcinfo_dump(mcip);
1680     mcinfo_dumpped++;
1681 
1682     return 0;
1683 }
1684 
1685 /* XXX shall we dump commited mc_info?? */
mc_panic_dump(void)1686 static void mc_panic_dump(void)
1687 {
1688     int cpu;
1689 
1690     dprintk(XENLOG_ERR, "Begin dump mc_info\n");
1691     for_each_online_cpu(cpu)
1692         mctelem_process_deferred(cpu, x86_mcinfo_dump_panic,
1693                                  mctelem_has_deferred_lmce(cpu));
1694     dprintk(XENLOG_ERR, "End dump mc_info, %x mcinfo dumped\n", mcinfo_dumpped);
1695 }
1696 
mc_panic(char * s)1697 void mc_panic(char *s)
1698 {
1699     is_mc_panic = true;
1700     console_force_unlock();
1701 
1702     printk("Fatal machine check: %s\n", s);
1703     printk("\n"
1704            "****************************************\n"
1705            "\n"
1706            "   The processor has reported a hardware error which cannot\n"
1707            "   be recovered from.  Xen will now reboot the machine.\n");
1708     mc_panic_dump();
1709     panic("HARDWARE ERROR\n");
1710 }
1711 
/*
 * Machine check owner election (mce_owner):
 * When an error happens, all CPUs serially read their MSR banks.
 * The first CPU to fetch a bank's error information clears that bank,
 * so later readers see nothing; that first CPU is the actual mce_owner.
 *
 * A fatal (pcc = 1) error might crash the machine before we are able
 * to log it.  To avoid losing the log, we scan in two rounds:
 * Round 1: simply scan; if pcc = 1 or ripv = 0 is found, reset right away.
 * MCE banks are sticky, so after reboot the MCE polling mechanism
 * will collect and log those errors.
 * Round 2: do all MCE processing as normal.
 */

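/*
 * For reference only: a minimal sketch (not built, and not part of the
 * original file) of the round-1 policy described above.  Bit positions are
 * taken from the SDM (MCi_STATUS: VAL = bit 63, PCC = bit 57; MCG_STATUS:
 * RIPV = bit 0); the real scanning is done by the vendor handlers installed
 * via x86_mce_vector_register(), not by this helper.
 */
#if 0
static bool round1_needs_reset(void)
{
    unsigned int i, nbanks = this_cpu(nr_mce_banks);
    uint64_t gstatus, status;

    rdmsrl(MSR_IA32_MCG_STATUS, gstatus);

    for ( i = 0; i < nbanks; i++ )
    {
        rdmsrl(MSR_IA32_MCx_STATUS(i), status);

        if ( !(status & (1ULL << 63)) )     /* VAL: bank holds no error */
            continue;

        if ( (status & (1ULL << 57)) ||     /* PCC: processor context corrupt */
             !(gstatus & (1ULL << 0)) )     /* !RIPV: cannot safely resume */
            return true;                    /* round 1: reset immediately */
    }

    return false;                           /* proceed to round 2 */
}
#endif
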
/* Maybe called in MCE context, no lock, no printk */
static enum mce_result mce_action(const struct cpu_user_regs *regs,
                                  mctelem_cookie_t mctc)
{
    struct mc_info *local_mi;
    enum mce_result bank_result = MCER_NOERROR;
    enum mce_result worst_result = MCER_NOERROR;
    struct mcinfo_common *mic = NULL;
    struct mca_binfo binfo;
    const struct mca_error_handler *handlers = mce_dhandlers;
    unsigned int i, handler_num = mce_dhandler_num;

    /* When in MCE context, regs is valid */
    if ( regs )
    {
        handler_num = mce_uhandler_num;
        handlers = mce_uhandlers;
    }

    local_mi = (struct mc_info *)mctelem_dataptr(mctc);
    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
    if ( mic == NULL )
    {
        printk(KERN_ERR "MCE: get local buffer entry failed\n");
        return MCER_CONTINUE;
    }

    memset(&binfo, 0, sizeof(binfo));
    binfo.mig = (struct mcinfo_global *)mic;
    binfo.mi = local_mi;

    /* Processing bank information */
    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);

    for ( ; bank_result != MCER_RESET && mic && mic->size;
          mic = x86_mcinfo_next(mic) )
    {
        if ( mic->type != MC_TYPE_BANK )
        {
            continue;
        }
        binfo.mib = (struct mcinfo_bank *)mic;
        binfo.bank = binfo.mib->mc_bank;
        bank_result = MCER_NOERROR;
        for ( i = 0; i < handler_num; i++ )
        {
            if ( handlers[i].owned_error(binfo.mib->mc_status) )
            {
                handlers[i].recovery_handler(&binfo, &bank_result, regs);
                if ( worst_result < bank_result )
                    worst_result = bank_result;
                break;
            }
        }
    }

    return worst_result;
}

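/*
 * For reference only: a minimal sketch (not built, and not part of the
 * original file) of the calling convention mce_action() relies on above.
 * The prototypes are assumed from the owned_error()/recovery_handler()
 * calls in mce_action(); the handler below is purely hypothetical, and
 * real entries live in the mce_dhandlers/mce_uhandlers arrays provided
 * by the vendor code.
 */
#if 0
static bool hyp_owned_error(uint64_t status)
{
    /* Claim any bank whose status has VAL (bit 63) set. */
    return status & (1ULL << 63);
}

static void hyp_recovery_handler(struct mca_binfo *binfo,
                                 enum mce_result *result,
                                 const struct cpu_user_regs *regs)
{
    /* No real recovery: record that the error was only logged. */
    *result = MCER_CONTINUE;
}

static const struct mca_error_handler hyp_handler = {
    .owned_error      = hyp_owned_error,
    .recovery_handler = hyp_recovery_handler,
};
#endif
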
/*
 * Called from mctelem_process_deferred().  Return 1 if the telemetry
 * should be committed for dom0 consumption, 0 if it should be
 * dismissed.
 */
static int mce_delayed_action(mctelem_cookie_t mctc)
{
    enum mce_result result;
    int ret = 0;

    result = mce_action(NULL, mctc);

    switch ( result )
    {
    case MCER_RESET:
        dprintk(XENLOG_ERR, "MCE delayed action failed\n");
        is_mc_panic = true;
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        panic("MCE: Software recovery failed for the UCR\n");
        break;

    case MCER_RECOVERED:
        dprintk(XENLOG_INFO, "MCE: Error was successfully recovered\n");
        ret = 1;
        break;

    case MCER_CONTINUE:
        dprintk(XENLOG_INFO,
                "MCE: Error cannot be recovered, system is tainted\n");
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        ret = 1;
        break;

    default:
        ret = 0;
        break;
    }

    return ret;
}

/* Softirq handler for MCE# processing */
static void mce_softirq(void)
{
    static DEFINE_MCE_BARRIER(mce_inside_bar);
    static DEFINE_MCE_BARRIER(mce_severity_bar);
    static atomic_t severity_cpu;
    int cpu = smp_processor_id();
    unsigned int workcpu;
    bool lmce = mctelem_has_deferred_lmce(cpu);
    bool bcast = mce_broadcast && !lmce;

    mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);

    mce_barrier_enter(&mce_inside_bar, bcast);

    if ( !lmce )
    {
        /*
         * Everybody is here.  Now let's see who gets to do the
         * recovery work.  Right now we just pick a CPU that did not
         * have any problems itself, if there is one.
         *
         * First set a default value: the last CPU to reach this point
         * overwrites the value and becomes the default.
         */

        atomic_set(&severity_cpu, cpu);

        mce_barrier_enter(&mce_severity_bar, bcast);
        if ( !mctelem_has_deferred(cpu) )
            atomic_set(&severity_cpu, cpu);
        mce_barrier_exit(&mce_severity_bar, bcast);
    }

    /* The chosen severity_cpu does the further processing */
    if ( lmce || atomic_read(&severity_cpu) == cpu )
    {
        mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);

        /*
         * Step 1: fill the Dom0 log buffer, the vMCE injection buffer,
         * and the vMCE MSR virtualization buffer.
         */
        if ( lmce )
            mctelem_process_deferred(cpu, mce_delayed_action, true);
        else
            for_each_online_cpu(workcpu)
                mctelem_process_deferred(workcpu, mce_delayed_action, false);

        /* Step 2: send the log to Dom0 through a vIRQ */
        if ( dom0_vmce_enabled() )
        {
            mce_printk(MCE_VERBOSE, "MCE: send MCE# to Dom0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
    }

    mce_barrier_exit(&mce_inside_bar, bcast);
}

mce_handler_init(void)1904 void mce_handler_init(void)
1905 {
1906     if ( smp_processor_id() != 0 )
1907         return;
1908 
1909     /* callback register, do we really need so many callback? */
1910     /* mce handler data initialization */
1911     spin_lock_init(&mce_logout_lock);
1912     open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1913 }
1914
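/*
 * For reference only: a minimal sketch (not built, and not part of the
 * original file) of how work reaches the mce_softirq() handler registered
 * above.  mctelem_defer()'s signature is assumed here from the
 * mctelem_process_deferred()/mctelem_has_deferred_lmce() helpers used
 * earlier in this file; the real deferral happens in the vendor #MC
 * handlers.
 */
#if 0
static void defer_and_kick(mctelem_cookie_t mctc, bool lmce)
{
    if ( mctc != NULL )
        mctelem_defer(mctc, lmce);         /* queued for mce_delayed_action() */

    raise_softirq(MACHINE_CHECK_SOFTIRQ); /* mce_softirq() runs later */
}
#endif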