1 #include <xen/types.h>
2 #include <xen/sched.h>
3 #include "mcaction.h"
4 #include "vmce.h"
5 #include "mce.h"
6 
7 static struct mcinfo_recovery *
mci_action_add_pageoffline(int bank,struct mc_info * mi,mfn_t mfn,uint32_t status)8 mci_action_add_pageoffline(int bank, struct mc_info *mi,
9                            mfn_t mfn, uint32_t status)
10 {
11     struct mcinfo_recovery *rec;
12 
13     if ( !mi )
14         return NULL;
15 
16     rec = x86_mcinfo_reserve(mi, sizeof(*rec), MC_TYPE_RECOVERY);
17     if ( !rec )
18     {
19         mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
20         return NULL;
21     }
22 
23     rec->mc_bank = bank;
24     rec->action_types = MC_ACTION_PAGE_OFFLINE;
25     rec->action_info.page_retire.mfn = mfn_x(mfn);
26     rec->action_info.page_retire.status = status;
27     return rec;
28 }
29 
30 mce_check_addr_t mc_check_addr = NULL;
31 
mce_register_addrcheck(mce_check_addr_t cbfunc)32 void mce_register_addrcheck(mce_check_addr_t cbfunc)
33 {
34     mc_check_addr = cbfunc;
35 }
36 
37 void
mc_memerr_dhandler(struct mca_binfo * binfo,enum mce_result * result,const struct cpu_user_regs * regs)38 mc_memerr_dhandler(struct mca_binfo *binfo,
39                    enum mce_result *result,
40                    const struct cpu_user_regs *regs)
41 {
42     struct mcinfo_bank *bank = binfo->mib;
43     struct mcinfo_global *global = binfo->mig;
44     struct domain *d;
45     mfn_t mfn;
46     unsigned long gfn;
47     uint32_t status;
48     int vmce_vcpuid;
49     unsigned int mc_vcpuid;
50 
51     if ( !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL) )
52     {
53         dprintk(XENLOG_WARNING,
54                 "No physical address provided for memory error\n");
55         return;
56     }
57 
58     mfn = maddr_to_mfn(bank->mc_addr);
59     if ( offline_page(mfn, 1, &status) )
60     {
61         dprintk(XENLOG_WARNING,
62                 "Failed to offline page %"PRI_mfn" for MCE error\n",
63                 mfn_x(mfn));
64         return;
65     }
66 
67     mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);
68 
69     /* This is free page */
70     if ( status & PG_OFFLINE_OFFLINED )
71         *result = MCER_RECOVERED;
72     else if ( status & PG_OFFLINE_AGAIN )
73         *result = MCER_CONTINUE;
74     else if ( status & PG_OFFLINE_PENDING )
75     {
76         /* This page has owner */
77         if ( status & PG_OFFLINE_OWNED )
78         {
79             bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
80             mce_printk(MCE_QUIET, "MCE: This error page is ownded"
81                        " by DOM %d\n", bank->mc_domid);
82             /*
83              * XXX: Cannot handle shared pages yet
84              * (this should identify all domains and gfn mapping to
85              *  the mfn in question)
86              */
87             BUG_ON( bank->mc_domid == DOMID_COW );
88             if ( bank->mc_domid != DOMID_XEN )
89             {
90                 d = get_domain_by_id(bank->mc_domid);
91                 ASSERT(d);
92                 gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
93 
94                 if ( unmmap_broken_page(d, mfn, gfn) )
95                 {
96                     printk("Unmap broken memory %"PRI_mfn" for DOM%d failed\n",
97                            mfn_x(mfn), d->domain_id);
98                     goto vmce_failed;
99                 }
100 
101                 mc_vcpuid = global->mc_vcpuid;
102                 if ( mc_vcpuid == XEN_MC_VCPUID_INVALID ||
103                      /*
104                       * Because MC# may happen asynchronously with the actual
105                       * operation that triggers the error, the domain ID as
106                       * well as the vCPU ID collected in 'global' at MC# are
107                       * not always precise. In that case, fallback to broadcast.
108                       */
109                      global->mc_domid != bank->mc_domid ||
110                      (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
111                       (!(global->mc_gstatus & MCG_STATUS_LMCE) ||
112                        !(d->vcpu[mc_vcpuid]->arch.vmce.mcg_ext_ctl &
113                          MCG_EXT_CTL_LMCE_EN))) )
114                     vmce_vcpuid = VMCE_INJECT_BROADCAST;
115                 else
116                     vmce_vcpuid = mc_vcpuid;
117 
118                 bank->mc_addr = gfn << PAGE_SHIFT |
119                                 (bank->mc_addr & (PAGE_SIZE - 1));
120                 if ( fill_vmsr_data(bank, d, global->mc_gstatus, vmce_vcpuid) )
121                 {
122                     mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
123                                "failed\n", bank->mc_domid);
124                     goto vmce_failed;
125                 }
126 
127                 /* We will inject vMCE to DOMU */
128                 if ( inject_vmce(d, vmce_vcpuid) < 0 )
129                 {
130                     mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
131                                " failed\n", d->domain_id);
132                     goto vmce_failed;
133                 }
134 
135                 /*
136                  * Impacted domain go on with domain's recovery job
137                  * if the domain has its own MCA handler.
138                  * For xen, it has contained the error and finished
139                  * its own recovery job.
140                  */
141                 *result = MCER_RECOVERED;
142                 put_domain(d);
143 
144                 return;
145 vmce_failed:
146                 put_domain(d);
147                 domain_crash(d);
148             }
149         }
150     }
151 }
152