/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/hvm/params.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P: software error recovery supported
 * MCG_TES_P: threshold-based error status present, so that MCi_STATUS
 *            bits 56:53 are not model specific
 * MCG_CMCI_P: expose the CMCI capability but never actually inject CMCIs
 *             into the guest; advertising it stops the guest from polling
 *             the banks periodically, which is better for performance
 */
#define INTEL_GUEST_MCG_CAP (MCG_SER_P | \
                             MCG_TES_P | \
                             MCG_CMCI_P | \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM
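/*
 * MSR_IA32_MCG_CAP holds the bank count in its low 8 bits and feature
 * flags in the bits above, so OR-ing GUEST_MC_BANK_NUM into the flag
 * sets above yields the complete guest-visible MCG_CAP value.
 */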

void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

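    /*
     * Fail the restore if the incoming image claims MCA capabilities this
     * build does not expose.  The bank-count field (MCG_CAP_COUNT) and
     * MCG_CTL_P are masked out of the check, i.e. they are tolerated even
     * if they differ from what a freshly created guest would see.
     */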
    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
                " %#" PRIx64 " for %pv (supported: %#Lx)\n",
                is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
                v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EPERM;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

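    /*
     * Each bank occupies four consecutive MSRs starting at
     * MSR_IA32_MC0_CTL: CTL, STATUS, ADDR, MISC.  Dividing the offset by
     * 4 above recovers the bank index, and masking with
     * (-MSR_IA32_MC0_CTL | 3) folds the MSR back onto its bank-0 alias,
     * which is what the case labels below match on.
     */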
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* Always report all 1's in MCi_CTL (all error reporting enabled). */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: unsupported; the access will #GP fault the guest
 * = 0: not handled here; should be handled by other components
 * > 0: success
 */
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest's MSR_IA32_MCG_CAP, Xen
         * always sets the LMCE and LOCK bits in the guest's
         * MSR_IA32_FEATURE_CONTROL, so they need not be checked here.
         */
        if ( cur->arch.vmce.mcg_cap & MCG_LMCE_P )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}
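
/*
 * Illustrative sketch (not a verbatim call site) of how an MSR-read
 * intercept is expected to act on vmce_rdmsr()'s return convention;
 * the #GP-injection step is whatever mechanism the caller already has:
 *
 *     int rc = vmce_rdmsr(msr, &val);
 *
 *     if ( rc < 0 )
 *         inject #GP into the guest
 *     else if ( rc == 0 )
 *         fall through to the generic MSR emulation path
 *     else
 *         hand the emulated 'val' back to the guest
 */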

/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

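    /* The MSR-to-bank decode here mirrors the one in bank_mce_rdmsr(). */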
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest (wrongly) clears any bit of MCi_CTL, treat the MSR
         * as not implemented and silently ignore the write.
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: unsupported; the access will #GP fault the guest
 * = 0: not handled here; should be handled by other components
 * > 0: success
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Treat writes as
         * no-ops, so the guest sees no change and gets no surprise.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

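    /*
     * MCG_EXT_CTL only exists when LMCE is advertised to the guest, and
     * the only defined bit is MCG_EXT_CTL_LMCE_EN; any other write makes
     * the access fail and #GP the guest.
     */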
    case MSR_IA32_MCG_EXT_CTL:
        if ( (cur->arch.vmce.mcg_cap & MCG_LMCE_P) &&
             !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

#ifdef CONFIG_HVM
static int vmce_save_vcpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
{
    struct hvm_vmce_vcpu ctxt = {
        .caps = v->arch.vmce.mcg_cap,
        .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
        .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
        .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
    };

    return hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
}

static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;

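    /*
     * The VMCE record in the stream may have been produced by an older
     * Xen with a shorter hvm_vmce_vcpu layout; hvm_load_entry_zeroextend()
     * below zero-fills any missing trailing fields before
     * vmce_restore_vcpu() validates them.
     */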
    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
#endif

/*
 * For Intel MCE, broadcast the vMCE to all vcpus.
 * For AMD MCE, only inject the vMCE to vcpu0.
 *
 * @ d, the domain into which the vMCE is injected
 * @ vcpu,
 *   -1 (VMCE_INJECT_BROADCAST): broadcast the vMCE to all vcpus
 *   >= 0: the vcpu the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to an uninitialized vcpu. */
        if ( !v->is_initialised )
            continue;

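        /*
         * The vMCE is delivered by latching mce_pending, which the vcpu
         * consumes when it next resumes guest execution.  If the guest
         * cannot receive it (no PV #MC callback registered) or a previous
         * vMCE is still pending, the injection fails with -EBUSY.
         */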
        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, TRAP_machine_check)) &&
             !test_and_set_bool(v->arch.mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}

static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
     * 2. Filter out the model-specific MSCOD error code from the
     *    MCi_STATUS value seen by the guest.
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

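    /*
     * LMCE means the error is delivered to exactly one vcpu, so a
     * broadcast injection must not carry the LMCE flag; conversely, an
     * LMCE-flagged injection is redirected to the target vcpu instead of
     * vCPU0.
     */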
    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * The vMCE with the actual error information is injected into vCPU0,
     * and, if a broadcast is required, we choose to inject less severe
     * vMCEs into the other vCPUs.  Thus the guest always gets the severest
     * error (i.e. the actual one) on vCPU0.  If the guest can recover from
     * the severest error on vCPU0, the less severe errors on the other
     * vCPUs will not prevent it from recovering on those vCPUs.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

/* Reportedly, some RAM is set up as mmio_direct to get the UC cache attribute. */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                         | p2m_to_mask(p2m_ram_logdirty) \
                         | p2m_to_mask(p2m_ram_ro) \
                         | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is
 * no need to consider the paging p2m type.
 * Currently only HVM guests using HAP (e.g. EPT) paging are supported.
 * XXX the following situations are not handled:
 *     PoD, foreign mapped, granted, shared pages
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust that dom0's MCE handler will prevent future accesses. */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

    rc = -1;
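    /*
     * get_gfn_query() takes a reference/lock on the GFN which must be
     * dropped via put_gfn() on every path, including when the type is
     * left unchanged.
     */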
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_eq(r_mfn, mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu ( d, v )
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}