/******************************************************************************
 * arch/x86/pv/domain.c
 *
 * PV domain handling
 */

#include <xen/domain_page.h>
#include <xen/errno.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>

#include <asm/cpufeature.h>
#include <asm/invpcid.h>
#include <asm/spec_ctrl.h>
#include <asm/pv/domain.h>
#include <asm/shadow.h>

#ifdef CONFIG_PV32
int8_t __read_mostly opt_pv32 = -1;
#endif

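/*
 * Command line handling for "pv=".  The only sub-option recognised at
 * present is "32" (in its "32", "no-32" and "32=<bool>" forms, as accepted
 * by parse_boolean()), controlling opt_pv32 and hence whether 32-bit PV
 * guests may be created.  E.g. booting with "pv=no-32" disables 32-bit PV
 * guest support.
 */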
static __init int parse_pv(const char *s)
{
    const char *ss;
    int val, rc = 0;

    do {
        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        if ( (val = parse_boolean("32", s, ss)) >= 0 )
        {
#ifdef CONFIG_PV32
            opt_pv32 = val;
#else
            no_config_param("PV32", "pv", s, ss);
#endif
        }
        else
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("pv", parse_pv);

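/*
 * "pcid=" command line control, effective for 64-bit PV domains on hardware
 * with PCID and INVPCID support: "on"/"off" enable or disable PCID use
 * wholesale, while "xpti" (the default) and "no-xpti" tie PCID use to
 * whether XPTI is active for a domain.  The setting is runtime adjustable
 * and, with CONFIG_HYPFS, also exposed via hypfs.
 */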
static __read_mostly enum {
    PCID_OFF,
    PCID_ALL,
    PCID_XPTI,
    PCID_NOXPTI
} opt_pcid = PCID_XPTI;

#ifdef CONFIG_HYPFS
static const char opt_pcid_2_string[][7] = {
    [PCID_OFF] = "off",
    [PCID_ALL] = "on",
    [PCID_XPTI] = "xpti",
    [PCID_NOXPTI] = "noxpti",
};

static void __init opt_pcid_init(struct param_hypfs *par)
{
    custom_runtime_set_var(par, opt_pcid_2_string[opt_pcid]);
}
#endif

static int parse_pcid(const char *s);
custom_runtime_param("pcid", parse_pcid, opt_pcid_init);

static int parse_pcid(const char *s)
{
    int rc = 0;

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_pcid = PCID_OFF;
        break;

    case 1:
        opt_pcid = PCID_ALL;
        break;

    default:
        switch ( parse_boolean("xpti", s, NULL) )
        {
        case 0:
            opt_pcid = PCID_NOXPTI;
            break;

        case 1:
            opt_pcid = PCID_XPTI;
            break;

        default:
            rc = -EINVAL;
            break;
        }
        break;
    }

    custom_runtime_set_var(param_2_parfs(parse_pcid),
                           opt_pcid_2_string[opt_pcid]);

    return rc;
}

static void noreturn continue_nonidle_domain(void)
{
    check_wakeup_from_wait();
    reset_stack_and_jump(ret_from_intr);
}

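/*
 * Allocate and initialise the L4 pagetable which carries a 32-bit (compat)
 * guest vCPU: only the Xen slots get filled in here, and the page is typed
 * as a validated L4 so that the shadow code can handle it.
 */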
static int setup_compat_l4(struct vcpu *v)
{
    struct page_info *pg;
    l4_pgentry_t *l4tab;
    mfn_t mfn;

    pg = alloc_domheap_page(v->domain, MEMF_no_owner | MEMF_no_scrub);
    if ( pg == NULL )
        return -ENOMEM;

    mfn = page_to_mfn(pg);
    l4tab = map_domain_page(mfn);
    clear_page(l4tab);
    init_xen_l4_slots(l4tab, mfn, v->domain, INVALID_MFN, false);
    unmap_domain_page(l4tab);

    /* This page needs to look like a pagetable so that it can be shadowed */
    pg->u.inuse.type_info = PGT_l4_page_table | PGT_validated | 1;

    v->arch.guest_table = pagetable_from_page(pg);
    v->arch.guest_table_user = v->arch.guest_table;

    return 0;
}

static void release_compat_l4(struct vcpu *v)
{
    if ( !pagetable_is_null(v->arch.guest_table) )
        free_domheap_page(pagetable_get_page(v->arch.guest_table));
    v->arch.guest_table = pagetable_null();
    v->arch.guest_table_user = pagetable_null();
}

unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4)
{
    const struct cpuid_policy *p = v->domain->arch.cpuid;

    /* Discard attempts to set guest controllable bits outside of the policy. */
    cr4 &= ~((p->basic.tsc ? 0 : X86_CR4_TSD) |
             (p->basic.de ? 0 : X86_CR4_DE) |
             (p->feat.fsgsbase ? 0 : X86_CR4_FSGSBASE) |
             (p->basic.xsave ? 0 : X86_CR4_OSXSAVE));

    /* Masks expected to be disjoint sets. */
    BUILD_BUG_ON(PV_CR4_GUEST_MASK & PV_CR4_GUEST_VISIBLE_MASK);

    /*
     * A guest sees the policy subset of its own choice of guest controllable
     * bits, and a subset of Xen's choice of certain hardware settings.
     */
    return ((cr4 & PV_CR4_GUEST_MASK) |
            (mmu_cr4_features & PV_CR4_GUEST_VISIBLE_MASK));
}

static int8_t __read_mostly opt_global_pages = -1;
boolean_runtime_param("global-pages", opt_global_pages);

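/*
 * Default for "global-pages" when not specified on the command line: use
 * global pages except when running virtualized on AMD/Hygon hardware,
 * where they are presumed not to be a performance win.
 */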
static int __init pge_init(void)
{
    if ( opt_global_pages == -1 )
        opt_global_pages = !cpu_has_hypervisor ||
                           !(boot_cpu_data.x86_vendor &
                             (X86_VENDOR_AMD | X86_VENDOR_HYGON));

    return 0;
}
__initcall(pge_init);

unsigned long pv_make_cr4(const struct vcpu *v)
{
    const struct domain *d = v->domain;
    unsigned long cr4 = mmu_cr4_features &
        ~(X86_CR4_PCIDE | X86_CR4_PGE | X86_CR4_TSD);

    /*
     * PCIDE or PGE depends on the PCID/XPTI settings, but must not both be
     * set, as it impacts the safety of TLB flushing.
     */
    if ( d->arch.pv.pcid )
        cr4 |= X86_CR4_PCIDE;
    else if ( !d->arch.pv.xpti && opt_global_pages )
        cr4 |= X86_CR4_PGE;

    /*
     * TSD is needed if either the guest has elected to use it, or Xen is
     * virtualising the TSC value the guest sees.
     */
    if ( d->arch.vtsc || (v->arch.pv.ctrlreg[4] & X86_CR4_TSD) )
        cr4 |= X86_CR4_TSD;

    /*
     * The {RD,WR}{FS,GS}BASE are only useable in 64bit code segments.  While
     * we must not have CR4.FSGSBASE set behind the back of a 64bit PV kernel,
     * we do leave it set in 32bit PV context to speed up Xen's context switch
     * path.
     */
    if ( !is_pv_32bit_domain(d) && !(v->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) )
        cr4 &= ~X86_CR4_FSGSBASE;

    return cr4;
}

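/*
 * Convert a 64-bit PV domain into a 32-bit (compat) one.  This is only
 * permitted while the domain has no memory allocated to it yet, and only
 * when 32-bit PV support is compiled in and hasn't been disabled via
 * "pv=no-32".
 */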
int switch_compat(struct domain *d)
{
    struct vcpu *v;
    int rc;

    BUILD_BUG_ON(offsetof(struct shared_info, vcpu_info) != 0);

    if ( !opt_pv32 )
        return -EOPNOTSUPP;
    if ( is_hvm_domain(d) || domain_tot_pages(d) != 0 )
        return -EACCES;
    if ( is_pv_32bit_domain(d) )
        return 0;

    d->arch.has_32bit_shinfo = 1;
    d->arch.pv.is_32bit = true;

    for_each_vcpu( d, v )
    {
        if ( (rc = setup_compat_arg_xlat(v)) ||
             (rc = setup_compat_l4(v)) )
            goto undo_and_fail;
    }

    domain_set_alloc_bitsize(d);
    recalculate_cpuid_policy(d);

    d->arch.x87_fip_width = 4;

    d->arch.pv.xpti = false;
    d->arch.pv.pcid = false;

    return 0;

 undo_and_fail:
    d->arch.pv.is_32bit = d->arch.has_32bit_shinfo = false;
    for_each_vcpu( d, v )
    {
        free_compat_arg_xlat(v);
        release_compat_l4(v);
    }

    return rc;
}

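/*
 * Each vCPU owns a slice of the per-domain GDT/LDT virtual area.  These
 * helpers install and remove the L1 pagetable backing for that slice,
 * using the domain-wide gdt_ldt_l1tab set up at domain creation.
 */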
static int pv_create_gdt_ldt_l1tab(struct vcpu *v)
{
    return create_perdomain_mapping(v->domain, GDT_VIRT_START(v),
                                    1U << GDT_LDT_VCPU_SHIFT,
                                    v->domain->arch.pv.gdt_ldt_l1tab,
                                    NULL);
}

static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
{
    destroy_perdomain_mapping(v->domain, GDT_VIRT_START(v),
                              1U << GDT_LDT_VCPU_SHIFT);
}

void pv_vcpu_destroy(struct vcpu *v)
{
    if ( is_pv_32bit_vcpu(v) )
    {
        free_compat_arg_xlat(v);
        release_compat_l4(v);
    }

    pv_destroy_gdt_ldt_l1tab(v);
    XFREE(v->arch.pv.trap_ctxt);
}

int pv_vcpu_initialise(struct vcpu *v)
{
    struct domain *d = v->domain;
    int rc;

    ASSERT(!is_idle_domain(d));

    rc = pv_create_gdt_ldt_l1tab(v);
    if ( rc )
        return rc;

    BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
                 PAGE_SIZE);
    v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
    if ( !v->arch.pv.trap_ctxt )
    {
        rc = -ENOMEM;
        goto done;
    }

    /* PV guests by default have a 100Hz ticker. */
    v->periodic_period = MILLISECS(10);

    v->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(v, 0);

    if ( is_pv_32bit_domain(d) )
    {
        if ( (rc = setup_compat_arg_xlat(v)) )
            goto done;

        if ( (rc = setup_compat_l4(v)) )
            goto done;
    }

 done:
    if ( rc )
        pv_vcpu_destroy(v);
    return rc;
}

void pv_domain_destroy(struct domain *d)
{
    pv_l1tf_domain_destroy(d);

    destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
                              GDT_LDT_MBYTES << (20 - PAGE_SHIFT));

    XFREE(d->arch.pv.cpuidmasks);

    FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
}

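/*
 * Set up the domain-wide PV state: L1TF mitigation state, the GDT/LDT
 * mapping area and its L1 table, CPUID masking state (where the hardware
 * offers masking MSRs rather than faulting), and the domain's XPTI/PCID
 * settings as derived from the respective command line options.
 */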
int pv_domain_initialise(struct domain *d)
{
    static const struct arch_csw pv_csw = {
        .from = paravirt_ctxt_switch_from,
        .to = paravirt_ctxt_switch_to,
        .tail = continue_nonidle_domain,
    };
    int rc = -ENOMEM;

    pv_l1tf_domain_init(d);

    d->arch.pv.gdt_ldt_l1tab =
        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
    if ( !d->arch.pv.gdt_ldt_l1tab )
        goto fail;
    clear_page(d->arch.pv.gdt_ldt_l1tab);

    if ( levelling_caps & ~LCAP_faulting &&
         (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
        goto fail;

    rc = create_perdomain_mapping(d, GDT_LDT_VIRT_START,
                                  GDT_LDT_MBYTES << (20 - PAGE_SHIFT),
                                  NULL, NULL);
    if ( rc )
        goto fail;

    d->arch.ctxt_switch = &pv_csw;

    d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;

    if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
        switch ( ACCESS_ONCE(opt_pcid) )
        {
        case PCID_OFF:
            break;

        case PCID_ALL:
            d->arch.pv.pcid = true;
            break;

        case PCID_XPTI:
            d->arch.pv.pcid = d->arch.pv.xpti;
            break;

        case PCID_NOXPTI:
            d->arch.pv.pcid = !d->arch.pv.xpti;
            break;

        default:
            ASSERT_UNREACHABLE();
            break;
        }

    return 0;

 fail:
    pv_domain_destroy(d);

    return rc;
}

bool __init xpti_pcid_enabled(void)
{
    return use_invpcid && cpu_has_pcid &&
           (opt_pcid == PCID_ALL || opt_pcid == PCID_XPTI);
}

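/*
 * 64-bit PV guests use separate page tables for kernel and user mode;
 * toggling TF_kernel_mode switches between them.  The deferred runstate
 * area / system time updates at the end are only attempted in kernel mode,
 * as the respective guest mappings may be absent from the user page tables.
 */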
static void _toggle_guest_pt(struct vcpu *v)
{
    unsigned long cr3;

    v->arch.flags ^= TF_kernel_mode;
    update_cr3(v);

    /*
     * Don't flush user global mappings from the TLB. Don't tick TLB clock.
     *
     * In shadow mode, though, update_cr3() may need to be accompanied by a
     * TLB flush (for just the incoming PCID), as the top level page table may
     * have changed behind our backs. To be on the safe side, suppress the
     * no-flush unconditionally in this case.
     */
    cr3 = v->arch.cr3;
    if ( shadow_mode_enabled(v->domain) )
        cr3 &= ~X86_CR3_NOFLUSH;
    write_cr3(cr3);

    if ( !(v->arch.flags & TF_kernel_mode) )
        return;

    if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) )
        v->arch.pv.need_update_runstate_area = 0;

    if ( v->arch.pv.pending_system_time.version &&
         update_secondary_system_time(v, &v->arch.pv.pending_system_time) )
        v->arch.pv.pending_system_time.version = 0;
}

void toggle_guest_mode(struct vcpu *v)
{
    const struct domain *d = v->domain;
    unsigned long gs_base;

    ASSERT(!is_pv_32bit_vcpu(v));

    /*
     * Update the cached value of the GS base about to become inactive, as a
     * subsequent context switch won't bother re-reading it.
     */
    gs_base = rdgsbase();
    if ( v->arch.flags & TF_kernel_mode )
        v->arch.pv.gs_base_kernel = gs_base;
    else
        v->arch.pv.gs_base_user = gs_base;
    asm volatile ( "swapgs" );

    _toggle_guest_pt(v);

    if ( d->arch.pv.xpti )
    {
        struct cpu_info *cpu_info = get_cpu_info();

        cpu_info->root_pgt_changed = true;
        cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
                           (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
        /*
         * As in _toggle_guest_pt() the XPTI CR3 write needs to be a TLB-
         * flushing one too for shadow mode guests.
         */
        if ( shadow_mode_enabled(d) )
            cpu_info->pv_cr3 &= ~X86_CR3_NOFLUSH;
    }
}

/*
 * Must be called in matching pairs without returning to guest context
 * in between.
 */
void toggle_guest_pt(struct vcpu *v)
{
    if ( !is_pv_32bit_vcpu(v) )
        _toggle_guest_pt(v);
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */