/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/hvm/params.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P:  software error recovery supported
 * MCG_TES_P:  advertise threshold-based error status, so MCi_STATUS
 *             bits 56:53 are architectural rather than model specific
 * MCG_CMCI_P: expose the CMCI capability but never actually inject CMCI
 *             into the guest; this spares the guest from polling the
 *             banks periodically, which would hurt performance
 */
#define INTEL_GUEST_MCG_CAP (MCG_SER_P |  \
                             MCG_TES_P |  \
                             MCG_CMCI_P | \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM
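
/*
 * In both cases GUEST_MC_BANK_NUM lands in MCG_CAP[7:0], the architectural
 * "Count" field, i.e. the number of MC banks presented to the guest.
 */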

void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

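    /*
     * Reject any capability bit we would not offer ourselves.  The bank
     * count field (MCG_CAP_COUNT) and MCG_CTL_P are deliberately left out
     * of the check, presumably so that records from older vMCE
     * implementations can still be restored.
     */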
    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
                " %#" PRIx64 " for %pv (supported: %#Lx)\n",
                is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
                v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EPERM;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

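    /*
     * Each bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC)
     * starting at MSR_IA32_MC0_CTL, which is suitably aligned.  Masking
     * with (-MSR_IA32_MC0_CTL | 3) therefore clears the per-bank offset
     * and folds every MCi_* MSR onto its MC0_* case label below.
     */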
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest's MSR_IA32_MCG_CAP, Xen has
         * already set the LMCE and LOCK bits in the guest's
         * MSR_IA32_FEATURE_CONTROL, so there is no need to check them here.
         */
        if ( cur->arch.vmce.mcg_cap & MCG_LMCE_P )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}
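
/*
 * A sketch of how a caller is expected to consume the tri-state return
 * value (the real dispatch lives in the MSR intercept code, not in this
 * file):
 *
 *     ret = vmce_rdmsr(msr, &val);
 *     if ( ret < 0 )
 *         ... raise #GP in the guest ...
 *     else if ( ret == 0 )
 *         ... let the next MSR handler have a try ...
 *     else
 *         ... the read was emulated here, hand val back to the guest ...
 */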

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest goes so far as to clear bits in MCi_CTL, treat the
         * register as not implemented and silently ignore the write.
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Simply ignore the
         * write here, which should not surprise the guest.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

    case MSR_IA32_MCG_EXT_CTL:
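        /*
         * Only the architectural LMCE enable bit may be set, and only when
         * MCG_LMCE_P is advertised in the guest's MCG_CAP; any other write
         * is rejected (and will #GP per the convention above).
         */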
        if ( (cur->arch.vmce.mcg_cap & MCG_LMCE_P) &&
             !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

#ifdef CONFIG_HVM
static int vmce_save_vcpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
{
    struct hvm_vmce_vcpu ctxt = {
        .caps = v->arch.vmce.mcg_cap,
        .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
        .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
        .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
    };

    return hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
}

static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;

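    /*
     * Use the zero-extending form of the load so that records written by
     * older Xen, which may be shorter than the current structure, still
     * restore with the missing trailing fields defaulted to zero.
     */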
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
#endif

/*
 * For Intel MCE, broadcast the vMCE to all vcpus.
 * For AMD MCE, only inject the vMCE to vcpu0.
 *
 * @ d, the domain into which the vMCE is injected
 * @ vcpu,
 *   -1 (VMCE_INJECT_BROADCAST), broadcast the vMCE to all vcpus
 *   >= 0, the vcpu the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to uninitialized VCPU. */
        if ( !v->is_initialised )
            continue;

        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, TRAP_machine_check)) &&
             !test_and_set_bool(v->arch.mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}
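
/*
 * Typical usage (a sketch, not a contract): a caller first stages the
 * virtual MSR contents via fill_vmsr_data() below and then delivers the
 * #MC with inject_vmce(d, VMCE_INJECT_BROADCAST) or
 * inject_vmce(d, vcpu_id).
 */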

static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
     * 2. Filter the model specific MSCOD error code out of MCi_STATUS
     *    before exposing it to the guest.
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
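        /*
         * An LMCE is local to a single vCPU, so deliver the error details
         * to the vCPU that observed it rather than to vCPU0.
         */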
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * The vMCE carrying the actual error information is injected into
     * vCPU0, and, if a broadcast is required, less severe vMCEs are
     * injected into the other vCPUs.  The guest therefore always sees the
     * severest error (i.e. the actual one) on vCPU0; if it can recover
     * from that, the less severe errors on the other vCPUs will not
     * prevent recovery there either.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

/* Some RAM is reportedly set up as mmio_direct for the UC cache attribute. */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                                | p2m_to_mask(p2m_ram_logdirty) \
                                | p2m_to_mask(p2m_ram_ro)       \
                                | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is no
 * need to consider the paging p2m type.
 * Only HVM guests using EPT paging mode are supported for now.
 * XXX cases not yet handled: PoD, foreign mapped, granted, shared pages.
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust that dom0's MCE handler will prevent future accesses. */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

    rc = -1;
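    /*
     * If the gfn still maps the broken mfn with one of the types above,
     * flip it to p2m_ram_broken so the guest can no longer reach the bad
     * frame through this mapping.
     */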
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_eq(r_mfn, mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
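        /*
         * Advertising MCG_LMCE_P is what later lets the guest access
         * MSR_IA32_MCG_EXT_CTL in vmce_{rd,wr}msr() above.
         */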
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu ( d, v )
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}