/******************************************************************************
 * arch/x86/pv/domain.c
 *
 * PV domain handling
 */

#include <xen/domain_page.h>
#include <xen/errno.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>

#include <asm/cpufeature.h>
#include <asm/invpcid.h>
#include <asm/spec_ctrl.h>
#include <asm/pv/domain.h>
#include <asm/shadow.h>

#ifdef CONFIG_PV32
int8_t __read_mostly opt_pv32 = -1;
#endif

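/*
 * Parse the "pv=" command line option.  The only recognized sub-option is
 * "[no-]32", controlling the availability of 32-bit PV guests.
 */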
static __init int parse_pv(const char *s)
{
    const char *ss;
    int val, rc = 0;

    do {
        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        if ( (val = parse_boolean("32", s, ss)) >= 0 )
        {
#ifdef CONFIG_PV32
            opt_pv32 = val;
#else
            no_config_param("PV32", "pv", s, ss);
#endif
        }
        else
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("pv", parse_pv);

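/*
 * PCID usage policy for 64-bit PV guests: never, always, only for domains
 * running with XPTI page table isolation, or only for domains without it.
 */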
static __read_mostly enum {
    PCID_OFF,
    PCID_ALL,
    PCID_XPTI,
    PCID_NOXPTI
} opt_pcid = PCID_XPTI;

#ifdef CONFIG_HYPFS
static const char opt_pcid_2_string[][7] = {
    [PCID_OFF] = "off",
    [PCID_ALL] = "on",
    [PCID_XPTI] = "xpti",
    [PCID_NOXPTI] = "noxpti",
};

static void __init opt_pcid_init(struct param_hypfs *par)
{
    custom_runtime_set_var(par, opt_pcid_2_string[opt_pcid]);
}
#endif

static int parse_pcid(const char *s);
custom_runtime_param("pcid", parse_pcid, opt_pcid_init);

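/*
 * Parse the "pcid=" command line / runtime option: a plain boolean selects
 * PCID_OFF / PCID_ALL, while "xpti" / "no-xpti" select PCID_XPTI /
 * PCID_NOXPTI respectively.
 */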
static int parse_pcid(const char *s)
{
    int rc = 0;

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_pcid = PCID_OFF;
        break;

    case 1:
        opt_pcid = PCID_ALL;
        break;

    default:
        switch ( parse_boolean("xpti", s, NULL) )
        {
        case 0:
            opt_pcid = PCID_NOXPTI;
            break;

        case 1:
            opt_pcid = PCID_XPTI;
            break;

        default:
            rc = -EINVAL;
            break;
        }
        break;
    }

    custom_runtime_set_var(param_2_parfs(parse_pcid),
                           opt_pcid_2_string[opt_pcid]);

    return rc;
}

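/* Tail hook of the context switch path, installed as pv_csw.tail below. */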
static void noreturn continue_nonidle_domain(void)
{
    check_wakeup_from_wait();
    reset_stack_and_jump(ret_from_intr);
}

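/*
 * Allocate and initialize the L4 page table used by a 32-bit (compat) PV
 * vCPU, filling in only the Xen slots.  The same table serves as both the
 * kernel and user view.
 */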
static int setup_compat_l4(struct vcpu *v)
{
    struct page_info *pg;
    l4_pgentry_t *l4tab;
    mfn_t mfn;

    pg = alloc_domheap_page(v->domain, MEMF_no_owner | MEMF_no_scrub);
    if ( pg == NULL )
        return -ENOMEM;

    mfn = page_to_mfn(pg);
    l4tab = map_domain_page(mfn);
    clear_page(l4tab);
    init_xen_l4_slots(l4tab, mfn, v->domain, INVALID_MFN, false);
    unmap_domain_page(l4tab);

    /* This page needs to look like a pagetable so that it can be shadowed */
    pg->u.inuse.type_info = PGT_l4_page_table | PGT_validated | 1;

    v->arch.guest_table = pagetable_from_page(pg);
    v->arch.guest_table_user = v->arch.guest_table;

    return 0;
}

static void release_compat_l4(struct vcpu *v)
{
    if ( !pagetable_is_null(v->arch.guest_table) )
        free_domheap_page(pagetable_get_page(v->arch.guest_table));
    v->arch.guest_table = pagetable_null();
    v->arch.guest_table_user = pagetable_null();
}

unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4)
{
    const struct cpuid_policy *p = v->domain->arch.cpuid;

    /* Discard attempts to set guest controllable bits outside of the policy. */
    cr4 &= ~((p->basic.tsc     ? 0 : X86_CR4_TSD)      |
             (p->basic.de      ? 0 : X86_CR4_DE)       |
             (p->feat.fsgsbase ? 0 : X86_CR4_FSGSBASE) |
             (p->basic.xsave   ? 0 : X86_CR4_OSXSAVE));

    /* Masks expected to be disjoint sets. */
    BUILD_BUG_ON(PV_CR4_GUEST_MASK & PV_CR4_GUEST_VISIBLE_MASK);

    /*
     * A guest sees the policy subset of its own choice of guest controllable
     * bits, and a subset of Xen's choice of certain hardware settings.
     */
    return ((cr4 & PV_CR4_GUEST_MASK) |
            (mmu_cr4_features & PV_CR4_GUEST_VISIBLE_MASK));
}

static int8_t __read_mostly opt_global_pages = -1;
boolean_runtime_param("global-pages", opt_global_pages);

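/*
 * Resolve the "global-pages" default: on, except when Xen itself runs
 * virtualized on AMD or Hygon hardware.
 */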
static int __init pge_init(void)
{
    if ( opt_global_pages == -1 )
        opt_global_pages = !cpu_has_hypervisor ||
                           !(boot_cpu_data.x86_vendor &
                             (X86_VENDOR_AMD | X86_VENDOR_HYGON));

    return 0;
}
__initcall(pge_init);

unsigned long pv_make_cr4(const struct vcpu *v)
{
    const struct domain *d = v->domain;
    unsigned long cr4 = mmu_cr4_features &
        ~(X86_CR4_PCIDE | X86_CR4_PGE | X86_CR4_TSD);

    /*
     * PCIDE or PGE depends on the PCID/XPTI settings, but must not both be
     * set, as it impacts the safety of TLB flushing.
     */
    if ( d->arch.pv.pcid )
        cr4 |= X86_CR4_PCIDE;
    else if ( !d->arch.pv.xpti && opt_global_pages )
        cr4 |= X86_CR4_PGE;

    /*
     * TSD is needed if either the guest has elected to use it, or Xen is
     * virtualising the TSC value the guest sees.
     */
    if ( d->arch.vtsc || (v->arch.pv.ctrlreg[4] & X86_CR4_TSD) )
        cr4 |= X86_CR4_TSD;

    /*
     * The {RD,WR}{FS,GS}BASE instructions are only usable in 64bit code
     * segments.  While we must not have CR4.FSGSBASE set behind the back of
     * a 64bit PV kernel, we do leave it set in 32bit PV context to speed up
     * Xen's context switch path.
     */
    if ( !is_pv_32bit_domain(d) && !(v->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) )
        cr4 &= ~X86_CR4_FSGSBASE;

    return cr4;
}

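/*
 * Convert a 64-bit PV domain to 32-bit (compat) mode.  Only permitted
 * before any memory has been allocated to the domain.
 */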
int switch_compat(struct domain *d)
{
    struct vcpu *v;
    int rc;

    BUILD_BUG_ON(offsetof(struct shared_info, vcpu_info) != 0);

    if ( !opt_pv32 )
        return -EOPNOTSUPP;
    if ( is_hvm_domain(d) || domain_tot_pages(d) != 0 )
        return -EACCES;
    if ( is_pv_32bit_domain(d) )
        return 0;

    d->arch.has_32bit_shinfo = 1;
    d->arch.pv.is_32bit = true;

    for_each_vcpu( d, v )
    {
        if ( (rc = setup_compat_arg_xlat(v)) ||
             (rc = setup_compat_l4(v)) )
            goto undo_and_fail;
    }

    domain_set_alloc_bitsize(d);
    recalculate_cpuid_policy(d);

    d->arch.x87_fip_width = 4;

    d->arch.pv.xpti = false;
    d->arch.pv.pcid = false;

    return 0;

 undo_and_fail:
    d->arch.pv.is_32bit = d->arch.has_32bit_shinfo = false;
    for_each_vcpu( d, v )
    {
        free_compat_arg_xlat(v);
        release_compat_l4(v);
    }

    return rc;
}

static int pv_create_gdt_ldt_l1tab(struct vcpu *v)
{
    return create_perdomain_mapping(v->domain, GDT_VIRT_START(v),
                                    1U << GDT_LDT_VCPU_SHIFT,
                                    v->domain->arch.pv.gdt_ldt_l1tab,
                                    NULL);
}

static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
{
    destroy_perdomain_mapping(v->domain, GDT_VIRT_START(v),
                              1U << GDT_LDT_VCPU_SHIFT);
}

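/* Tear down per-vCPU PV state.  Also used to unwind partial initialization. */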
void pv_vcpu_destroy(struct vcpu *v)
{
    if ( is_pv_32bit_vcpu(v) )
    {
        free_compat_arg_xlat(v);
        release_compat_l4(v);
    }

    pv_destroy_gdt_ldt_l1tab(v);
    XFREE(v->arch.pv.trap_ctxt);
}

int pv_vcpu_initialise(struct vcpu *v)
{
    struct domain *d = v->domain;
    int rc;

    ASSERT(!is_idle_domain(d));

    rc = pv_create_gdt_ldt_l1tab(v);
    if ( rc )
        return rc;

    BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
                 PAGE_SIZE);
    v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
    if ( !v->arch.pv.trap_ctxt )
    {
        rc = -ENOMEM;
        goto done;
    }

    /* PV guests by default have a 100Hz ticker. */
    v->periodic_period = MILLISECS(10);

    v->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(v, 0);

    if ( is_pv_32bit_domain(d) )
    {
        if ( (rc = setup_compat_arg_xlat(v)) )
            goto done;

        if ( (rc = setup_compat_l4(v)) )
            goto done;
    }

 done:
    if ( rc )
        pv_vcpu_destroy(v);
    return rc;
}

void pv_domain_destroy(struct domain *d)
{
    pv_l1tf_domain_destroy(d);

    destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
                              GDT_LDT_MBYTES << (20 - PAGE_SHIFT));

    XFREE(d->arch.pv.cpuidmasks);

    FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
}

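/*
 * Initialize domain-wide PV state: the GDT/LDT shadow L1 table, CPUID
 * masking defaults, the per-domain GDT/LDT mapping, and the XPTI/PCID
 * policy for this domain.
 */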
int pv_domain_initialise(struct domain *d)
{
    static const struct arch_csw pv_csw = {
        .from = paravirt_ctxt_switch_from,
        .to   = paravirt_ctxt_switch_to,
        .tail = continue_nonidle_domain,
    };
    int rc = -ENOMEM;

    pv_l1tf_domain_init(d);

    d->arch.pv.gdt_ldt_l1tab =
        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
    if ( !d->arch.pv.gdt_ldt_l1tab )
        goto fail;
    clear_page(d->arch.pv.gdt_ldt_l1tab);

    if ( levelling_caps & ~LCAP_faulting &&
         (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
        goto fail;

    rc = create_perdomain_mapping(d, GDT_LDT_VIRT_START,
                                  GDT_LDT_MBYTES << (20 - PAGE_SHIFT),
                                  NULL, NULL);
    if ( rc )
        goto fail;

    d->arch.ctxt_switch = &pv_csw;

    d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;

    if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
        switch ( ACCESS_ONCE(opt_pcid) )
        {
        case PCID_OFF:
            break;

        case PCID_ALL:
            d->arch.pv.pcid = true;
            break;

        case PCID_XPTI:
            d->arch.pv.pcid = d->arch.pv.xpti;
            break;

        case PCID_NOXPTI:
            d->arch.pv.pcid = !d->arch.pv.xpti;
            break;

        default:
            ASSERT_UNREACHABLE();
            break;
        }

    return 0;

  fail:
    pv_domain_destroy(d);

    return rc;
}

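/* Report whether domains running with XPTI may end up using PCID. */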
bool __init xpti_pcid_enabled(void)
{
    return use_invpcid && cpu_has_pcid &&
           (opt_pcid == PCID_ALL || opt_pcid == PCID_XPTI);
}

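/*
 * Flip the current vCPU between its kernel and user mode page tables,
 * updating the runstate area and secondary system time when (re)entering
 * kernel mode.
 */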
static void _toggle_guest_pt(struct vcpu *v)
{
    unsigned long cr3;

    v->arch.flags ^= TF_kernel_mode;
    update_cr3(v);

    /*
     * Don't flush user global mappings from the TLB. Don't tick TLB clock.
     *
     * In shadow mode, though, update_cr3() may need to be accompanied by a
     * TLB flush (for just the incoming PCID), as the top level page table may
     * have changed behind our backs. To be on the safe side, suppress the
     * no-flush unconditionally in this case.
     */
    cr3 = v->arch.cr3;
    if ( shadow_mode_enabled(v->domain) )
        cr3 &= ~X86_CR3_NOFLUSH;
    write_cr3(cr3);

    if ( !(v->arch.flags & TF_kernel_mode) )
        return;

    if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) )
        v->arch.pv.need_update_runstate_area = 0;

    if ( v->arch.pv.pending_system_time.version &&
         update_secondary_system_time(v, &v->arch.pv.pending_system_time) )
        v->arch.pv.pending_system_time.version = 0;
}

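/*
 * Switch a 64-bit PV vCPU between guest kernel and user mode, swapping the
 * GS base and page tables and keeping the XPTI restore CR3 in sync.
 */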
void toggle_guest_mode(struct vcpu *v)
{
    const struct domain *d = v->domain;
    unsigned long gs_base;

    ASSERT(!is_pv_32bit_vcpu(v));

    /*
     * Update the cached value of the GS base about to become inactive, as a
     * subsequent context switch won't bother re-reading it.
     */
    gs_base = rdgsbase();
    if ( v->arch.flags & TF_kernel_mode )
        v->arch.pv.gs_base_kernel = gs_base;
    else
        v->arch.pv.gs_base_user = gs_base;
    asm volatile ( "swapgs" );

    _toggle_guest_pt(v);

    if ( d->arch.pv.xpti )
    {
        struct cpu_info *cpu_info = get_cpu_info();

        cpu_info->root_pgt_changed = true;
        cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
                           (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
        /*
         * As in _toggle_guest_pt() the XPTI CR3 write needs to be a TLB-
         * flushing one too for shadow mode guests.
         */
        if ( shadow_mode_enabled(d) )
            cpu_info->pv_cr3 &= ~X86_CR3_NOFLUSH;
    }
}

/*
 * Must be called in matching pairs without returning to guest context
 * in between.
 */
void toggle_guest_pt(struct vcpu *v)
{
    if ( !is_pv_32bit_vcpu(v) )
        _toggle_guest_pt(v);
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */