1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
6
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
13
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/domain.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <xen/iocap.h>
24 #include <xen/kernel.h>
25 #include <xen/hypercall.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <xen/cpu.h>
36 #include <xen/wait.h>
37 #include <xen/guest_access.h>
38 #include <xen/livepatch.h>
39 #include <public/sysctl.h>
40 #include <public/hvm/hvm_vcpu.h>
41 #include <asm/altp2m.h>
42 #include <asm/regs.h>
43 #include <asm/mc146818rtc.h>
44 #include <asm/system.h>
45 #include <asm/io.h>
46 #include <asm/processor.h>
47 #include <asm/desc.h>
48 #include <asm/i387.h>
49 #include <asm/xstate.h>
50 #include <asm/cpuidle.h>
51 #include <asm/mpspec.h>
52 #include <asm/ldt.h>
53 #include <asm/hvm/hvm.h>
54 #include <asm/hvm/nestedhvm.h>
55 #include <asm/hvm/support.h>
56 #include <asm/hvm/svm/svm.h>
57 #include <asm/hvm/viridian.h>
58 #include <asm/debugreg.h>
59 #include <asm/msr.h>
60 #include <asm/spec_ctrl.h>
61 #include <asm/traps.h>
62 #include <asm/nmi.h>
63 #include <asm/mce.h>
64 #include <asm/amd.h>
65 #include <xen/numa.h>
66 #include <xen/iommu.h>
67 #include <compat/vcpu.h>
68 #include <asm/psr.h>
69 #include <asm/pv/domain.h>
70 #include <asm/pv/mm.h>
72
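/*
 * Per-CPU pointer to the vcpu whose register state is currently loaded on
 * this CPU.  Due to lazy context switching (see __context_switch() below)
 * this can differ from 'current' while an idle vcpu is running.
 */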
73 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
74
75 static void default_idle(void);
76 void (*pm_idle) (void) __read_mostly = default_idle;
77 void (*dead_idle) (void) __read_mostly = default_dead_idle;
78
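/*
 * Halt the CPU until the next interrupt, but only if no work is pending.
 * Interrupts are disabled across the cpu_is_haltable() check so nothing can
 * become pending between the check and the halt; safe_halt() re-enables them
 * atomically with the HLT.
 */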
79 static void default_idle(void)
80 {
81 struct cpu_info *info = get_cpu_info();
82
83 local_irq_disable();
84 if ( cpu_is_haltable(smp_processor_id()) )
85 {
86 spec_ctrl_enter_idle(info);
87 safe_halt();
88 spec_ctrl_exit_idle(info);
89 }
90 else
91 local_irq_enable();
92 }
93
94 void default_dead_idle(void)
95 {
96 /*
97      * When going into S3, modified data may be held indefinitely in the
98      * caches of the CPUs spinning here unless flushed, and would then be
99      * discarded by a subsequent INIT.
100 */
101 spec_ctrl_enter_idle(get_cpu_info());
102 wbinvd();
103 halt();
104 spec_ctrl_exit_idle(get_cpu_info());
105 }
106
107 void play_dead(void)
108 {
109 unsigned int cpu = smp_processor_id();
110
111 local_irq_disable();
112
113 /* Change the NMI handler to a nop (see comment below). */
114 _set_gate_lower(&idt_tables[cpu][TRAP_nmi], SYS_DESC_irq_gate, 0,
115 &trap_nop);
116
117 /*
118      * NOTE: After cpu_exit_clear, per-cpu variables may no longer be accessible,
119 * as they may be freed at any time if offline CPUs don't get parked. In
120 * this case, heap corruption or #PF can occur (when heap debugging is
121 * enabled). For example, even printk() can involve tasklet scheduling,
122 * which touches per-cpu vars.
123 *
124 * Consider very carefully when adding code to *dead_idle. Most hypervisor
125 * subsystems are unsafe to call.
126 */
127 cpu_exit_clear(cpu);
128
129 for ( ; ; )
130 dead_idle();
131 }
132
133 static void noreturn idle_loop(void)
134 {
135 unsigned int cpu = smp_processor_id();
136 /*
137 * Idle vcpus might be attached to non-idle units! We don't do any
138 * standard idle work like tasklets or livepatching in this case.
139 */
140 bool guest = !is_idle_domain(current->sched_unit->domain);
141
142 for ( ; ; )
143 {
144 if ( cpu_is_offline(cpu) )
145 {
146 ASSERT(!guest);
147 play_dead();
148 }
149
150 /* Are we here for running vcpu context tasklets, or for idling? */
151 if ( !guest && unlikely(tasklet_work_to_do(cpu)) )
152 {
153 do_tasklet();
154 /* Livepatch work is always kicked off via a tasklet. */
155 check_for_livepatch_work();
156 }
157 /*
158          * Test softirqs twice --- first to see if we should even try scrubbing
159 * and then, after it is done, whether softirqs became pending
160 * while we were scrubbing.
161 */
162 else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
163 !softirq_pending(cpu) )
164 {
165 if ( guest )
166 sched_guest_idle(pm_idle, cpu);
167 else
168 pm_idle();
169 }
170 do_softirq();
171 }
172 }
173
174 void startup_cpu_idle_loop(void)
175 {
176 struct vcpu *v = current;
177
178 ASSERT(is_idle_vcpu(v));
179 cpumask_set_cpu(v->processor, v->domain->dirty_cpumask);
180 write_atomic(&v->dirty_cpu, v->processor);
181
182 reset_stack_and_jump(idle_loop);
183 }
184
185 void init_hypercall_page(struct domain *d, void *ptr)
186 {
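    /*
     * Poison the page with 0xcc (INT3) first, so that any hypercall slot not
     * populated below traps cleanly if a guest jumps into it.
     */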
187 memset(ptr, 0xcc, PAGE_SIZE);
188
189 if ( is_hvm_domain(d) )
190 hvm_init_hypercall_page(d, ptr);
191 else if ( is_pv_64bit_domain(d) )
192 pv_ring3_init_hypercall_page(ptr);
193 else if ( is_pv_32bit_domain(d) )
194 pv_ring1_init_hypercall_page(ptr);
195 else
196 ASSERT_UNREACHABLE();
197 }
198
199 void dump_pageframe_info(struct domain *d)
200 {
201 struct page_info *page;
202
203 printk("Memory pages belonging to domain %u:\n", d->domain_id);
204
205 if ( domain_tot_pages(d) >= 10 && d->is_dying < DOMDYING_dead )
206 {
207 printk(" DomPage list too long to display\n");
208 }
209 else
210 {
211 unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {};
212
213 spin_lock(&d->page_alloc_lock);
214 page_list_for_each ( page, &d->page_list )
215 {
216 unsigned int index = MASK_EXTR(page->u.inuse.type_info,
217 PGT_type_mask);
218
219 if ( ++total[index] > 16 )
220 {
221 switch ( page->u.inuse.type_info & PGT_type_mask )
222 {
223 case PGT_none:
224 case PGT_writable_page:
225 continue;
226 }
227 }
228 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
229 _p(mfn_x(page_to_mfn(page))),
230 page->count_info, page->u.inuse.type_info);
231 }
232 spin_unlock(&d->page_alloc_lock);
233 }
234
235 if ( is_hvm_domain(d) )
236 p2m_pod_dump_data(d);
237
238 spin_lock(&d->page_alloc_lock);
239
240 page_list_for_each ( page, &d->xenpage_list )
241 {
242 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
243 _p(mfn_x(page_to_mfn(page))),
244 page->count_info, page->u.inuse.type_info);
245 }
246
247 page_list_for_each ( page, &d->extra_page_list )
248 {
249 printk(" ExtraPage %p: caf=%08lx, taf=%" PRtype_info "\n",
250 _p(mfn_x(page_to_mfn(page))),
251 page->count_info, page->u.inuse.type_info);
252 }
253
254 spin_unlock(&d->page_alloc_lock);
255 }
256
257 void update_guest_memory_policy(struct vcpu *v,
258 struct guest_memory_policy *policy)
259 {
260 bool old_guest_mode = nestedhvm_is_n2(v);
261 bool new_guest_mode = policy->nested_guest_mode;
262
263 /*
264 * When 'v' is in the nested guest mode, all guest copy
265 * functions/macros which finally call paging_gva_to_gfn()
266 * transfer data to/from L2 guest. If the copy is intended for L1
267 * guest, we must first clear the nested guest flag (by setting
268 * policy->nested_guest_mode to false) before the copy and then
269 * restore the nested guest flag (by setting
270 * policy->nested_guest_mode to true) after the copy.
271 */
272 if ( unlikely(old_guest_mode != new_guest_mode) )
273 {
274 if ( new_guest_mode )
275 nestedhvm_vcpu_enter_guestmode(v);
276 else
277 nestedhvm_vcpu_exit_guestmode(v);
278 policy->nested_guest_mode = old_guest_mode;
279 }
280 }
281
282 #ifndef CONFIG_BIGMEM
283 /*
284 * The hole may be at or above the 44-bit boundary, so we need to determine
285 * the total bit count until reaching 32 significant (not squashed out) bits
286 * in PFN representations.
287 * Note that the way "bits" gets initialized/updated/bounds-checked guarantees
288 * that the function will never return zero, and hence will never be called
289 * more than once (which is important due to it being deliberately placed in
290 * .init.text).
291 */
292 static unsigned int __init noinline _domain_struct_bits(void)
293 {
294 unsigned int bits = 32 + PAGE_SHIFT;
295 unsigned int sig = hweight32(~pfn_hole_mask);
296 unsigned int mask = pfn_hole_mask >> 32;
297
298 for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
299 if ( !(mask & 1) )
300 ++sig;
301
302 return bits;
303 }
304 #endif
305
306 struct domain *alloc_domain_struct(void)
307 {
308 struct domain *d;
309 #ifdef CONFIG_BIGMEM
310 const unsigned int bits = 0;
311 #else
312 /*
313 * We pack the PDX of the domain structure into a 32-bit field within
314 * the page_info structure. Hence the MEMF_bits() restriction.
315 */
316 static unsigned int __read_mostly bits;
317
318 if ( unlikely(!bits) )
319 bits = _domain_struct_bits();
320 #endif
321
322 BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
323 d = alloc_xenheap_pages(0, MEMF_bits(bits));
324 if ( d != NULL )
325 clear_page(d);
326 return d;
327 }
328
329 void free_domain_struct(struct domain *d)
330 {
331 free_xenheap_page(d);
332 }
333
334 struct vcpu *alloc_vcpu_struct(const struct domain *d)
335 {
336 struct vcpu *v;
337 /*
338 * This structure contains embedded PAE PDPTEs, used when an HVM guest
339 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
340 * may require that the shadow CR3 points below 4GB, and hence the whole
341 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
342 */
343 unsigned int memflags =
344 (is_hvm_domain(d) && paging_mode_shadow(d)) ? MEMF_bits(32) : 0;
345
346 BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE);
347 v = alloc_xenheap_pages(0, memflags);
348 if ( v != NULL )
349 clear_page(v);
350 return v;
351 }
352
353 void free_vcpu_struct(struct vcpu *v)
354 {
355 free_xenheap_page(v);
356 }
357
358 /* Initialise various registers to their architectural INIT/RESET state. */
359 void arch_vcpu_regs_init(struct vcpu *v)
360 {
361 memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs));
362 v->arch.user_regs.eflags = X86_EFLAGS_MBS;
363
364 memset(v->arch.dr, 0, sizeof(v->arch.dr));
365 v->arch.dr6 = X86_DR6_DEFAULT;
366 v->arch.dr7 = X86_DR7_DEFAULT;
367 }
368
369 int arch_vcpu_create(struct vcpu *v)
370 {
371 struct domain *d = v->domain;
372 int rc;
373
374 v->arch.flags = TF_kernel_mode;
375
376 rc = mapcache_vcpu_init(v);
377 if ( rc )
378 return rc;
379
380 if ( !is_idle_domain(d) )
381 {
382 paging_vcpu_init(v);
383
384 if ( (rc = vcpu_init_fpu(v)) != 0 )
385 return rc;
386
387 vmce_init_vcpu(v);
388
389 arch_vcpu_regs_init(v);
390
391 if ( (rc = init_vcpu_msr_policy(v)) )
392 goto fail;
393 }
394 else if ( (rc = xstate_alloc_save_area(v)) != 0 )
395 return rc;
396
397 spin_lock_init(&v->arch.vpmu.vpmu_lock);
398
399 if ( is_hvm_domain(d) )
400 rc = hvm_vcpu_initialise(v);
401 else if ( !is_idle_domain(d) )
402 rc = pv_vcpu_initialise(v);
403 else
404 {
405 /* Idle domain */
406 v->arch.cr3 = __pa(idle_pg_table);
407 rc = 0;
408 v->arch.msrs = ZERO_BLOCK_PTR; /* Catch stray misuses */
409 }
410
411 if ( rc )
412 goto fail;
413
414 if ( !is_idle_domain(v->domain) )
415 {
416 vpmu_initialise(v);
417
418 cpuid_policy_updated(v);
419 }
420
421 return rc;
422
423 fail:
424 vcpu_destroy_fpu(v);
425 xfree(v->arch.msrs);
426 v->arch.msrs = NULL;
427
428 return rc;
429 }
430
431 void arch_vcpu_destroy(struct vcpu *v)
432 {
433 xfree(v->arch.vm_event);
434 v->arch.vm_event = NULL;
435
436 vcpu_destroy_fpu(v);
437
438 xfree(v->arch.msrs);
439 v->arch.msrs = NULL;
440
441 if ( is_hvm_vcpu(v) )
442 hvm_vcpu_destroy(v);
443 else
444 pv_vcpu_destroy(v);
445 }
446
447 int arch_sanitise_domain_config(struct xen_domctl_createdomain *config)
448 {
449 bool hvm = config->flags & XEN_DOMCTL_CDF_hvm;
450 unsigned int max_vcpus;
451
452 if ( hvm ? !hvm_enabled : !IS_ENABLED(CONFIG_PV) )
453 {
454 dprintk(XENLOG_INFO, "%s support not available\n", hvm ? "HVM" : "PV");
455 return -EINVAL;
456 }
457
458 max_vcpus = hvm ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;
459
460 if ( config->max_vcpus > max_vcpus )
461 {
462 dprintk(XENLOG_INFO, "Requested vCPUs (%u) exceeds max (%u)\n",
463 config->max_vcpus, max_vcpus);
464 return -EINVAL;
465 }
466
467 if ( !IS_ENABLED(CONFIG_TBOOT) &&
468 (config->flags & XEN_DOMCTL_CDF_s3_integrity) )
469 {
470 dprintk(XENLOG_INFO, "S3 integrity check not valid without CONFIG_TBOOT\n");
471 return -EINVAL;
472 }
473
474 if ( (config->flags & XEN_DOMCTL_CDF_hap) && !hvm_hap_supported() )
475 {
476 dprintk(XENLOG_INFO, "HAP requested but not supported\n");
477 return -EINVAL;
478 }
479
480 if ( !(config->flags & XEN_DOMCTL_CDF_hvm) )
481 /*
482 * It is only meaningful for XEN_DOMCTL_CDF_oos_off to be clear
483 * for HVM guests.
484 */
485 config->flags |= XEN_DOMCTL_CDF_oos_off;
486
487 return 0;
488 }
489
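/*
 * Check the requested set of emulated devices against what is permitted for
 * this kind of domain: an HVM hardware domain must use exactly
 * LAPIC + IOAPIC + vPCI, other HVM domains either the full set minus vPCI or
 * just the LAPIC, while PV domains may use nothing at all or only the PIT.
 */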
490 static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
491 {
492 #ifdef CONFIG_HVM
493 /* This doesn't catch !CONFIG_HVM case but it is better than nothing */
494 BUILD_BUG_ON(X86_EMU_ALL != XEN_X86_EMU_ALL);
495 #endif
496
497 if ( is_hvm_domain(d) )
498 {
499 if ( is_hardware_domain(d) &&
500 emflags != (X86_EMU_VPCI | X86_EMU_LAPIC | X86_EMU_IOAPIC) )
501 return false;
502 if ( !is_hardware_domain(d) &&
503 emflags != (X86_EMU_ALL & ~X86_EMU_VPCI) &&
504 emflags != X86_EMU_LAPIC )
505 return false;
506 }
507 else if ( emflags != 0 && emflags != X86_EMU_PIT )
508 {
509 /* PV or classic PVH. */
510 return false;
511 }
512
513 return true;
514 }
515
516 int arch_domain_create(struct domain *d,
517 struct xen_domctl_createdomain *config)
518 {
519 bool paging_initialised = false;
520 uint32_t emflags;
521 int rc;
522
523 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
524
525 spin_lock_init(&d->arch.e820_lock);
526
527 /* Minimal initialisation for the idle domain. */
528 if ( unlikely(is_idle_domain(d)) )
529 {
530 static const struct arch_csw idle_csw = {
531 .from = paravirt_ctxt_switch_from,
532 .to = paravirt_ctxt_switch_to,
533 .tail = idle_loop,
534 };
535
536 d->arch.ctxt_switch = &idle_csw;
537
538 d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */
539 d->arch.msr = ZERO_BLOCK_PTR;
540
541 return 0;
542 }
543
544 if ( !config )
545 {
546 /* Only IDLE is allowed with no config. */
547 ASSERT_UNREACHABLE();
548 return -EINVAL;
549 }
550
551 if ( d->domain_id && cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) )
552 {
553 if ( !opt_allow_unsafe )
554 {
555 printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU"
556 " for security reasons.\n");
557 return -EPERM;
558 }
559 printk(XENLOG_G_WARNING
560 "Dom%d may compromise security on this CPU.\n",
561 d->domain_id);
562 }
563
564 emflags = config->arch.emulation_flags;
565
566 if ( is_hardware_domain(d) && is_pv_domain(d) )
567 emflags |= XEN_X86_EMU_PIT;
568
569 if ( emflags & ~XEN_X86_EMU_ALL )
570 {
571 printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n",
572 d->domain_id, emflags);
573 return -EINVAL;
574 }
575
576 if ( !emulation_flags_ok(d, emflags) )
577 {
578 printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation "
579 "with the current selection of emulators: %#x\n",
580 d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags);
581 return -EOPNOTSUPP;
582 }
583 d->arch.emulation_flags = emflags;
584
585 HYPERVISOR_COMPAT_VIRT_START(d) =
586 is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
587
588 if ( (rc = paging_domain_init(d)) != 0 )
589 goto fail;
590 paging_initialised = true;
591
592 if ( (rc = init_domain_cpuid_policy(d)) )
593 goto fail;
594
595 if ( (rc = init_domain_msr_policy(d)) )
596 goto fail;
597
598 d->arch.ioport_caps =
599 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
600 rc = -ENOMEM;
601 if ( d->arch.ioport_caps == NULL )
602 goto fail;
603
604 /*
605 * The shared_info machine address must fit in a 32-bit field within a
606 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
607 */
608 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
609 goto fail;
610
611 clear_page(d->shared_info);
612 share_xen_page_with_guest(virt_to_page(d->shared_info), d, SHARE_rw);
613
614 if ( (rc = init_domain_irq_mapping(d)) != 0 )
615 goto fail;
616
617 if ( (rc = iommu_domain_init(d, config->iommu_opts)) != 0 )
618 goto fail;
619
620 psr_domain_init(d);
621
622 if ( is_hvm_domain(d) )
623 {
624 if ( (rc = hvm_domain_initialise(d)) != 0 )
625 goto fail;
626 }
627 else if ( is_pv_domain(d) )
628 {
629 mapcache_domain_init(d);
630
631 if ( (rc = pv_domain_initialise(d)) != 0 )
632 goto fail;
633 }
634 else
635 ASSERT_UNREACHABLE(); /* Not HVM and not PV? */
636
637 if ( (rc = tsc_set_info(d, TSC_MODE_DEFAULT, 0, 0, 0)) != 0 )
638 {
639 ASSERT_UNREACHABLE();
640 goto fail;
641 }
642
643 /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
644 pit_init(d, cpu_khz);
645
646 /*
647 * If the FPU does not save FCS/FDS then we can always
648 * save/restore the 64-bit FIP/FDP and ignore the selectors.
649 */
650 d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;
651
652 domain_cpu_policy_changed(d);
653
654 return 0;
655
656 fail:
657 d->is_dying = DOMDYING_dead;
658 psr_domain_free(d);
659 iommu_domain_destroy(d);
660 cleanup_domain_irq_mapping(d);
661 free_xenheap_page(d->shared_info);
662 xfree(d->arch.cpuid);
663 xfree(d->arch.msr);
664 if ( paging_initialised )
665 paging_final_teardown(d);
666 free_perdomain_mappings(d);
667
668 return rc;
669 }
670
671 void arch_domain_destroy(struct domain *d)
672 {
673 if ( is_hvm_domain(d) )
674 hvm_domain_destroy(d);
675
676 xfree(d->arch.e820);
677 xfree(d->arch.cpuid);
678 xfree(d->arch.msr);
679
680 free_domain_pirqs(d);
681 if ( !is_idle_domain(d) )
682 iommu_domain_destroy(d);
683
684 paging_final_teardown(d);
685
686 if ( is_pv_domain(d) )
687 pv_domain_destroy(d);
688 free_perdomain_mappings(d);
689
690 free_xenheap_page(d->shared_info);
691 cleanup_domain_irq_mapping(d);
692
693 psr_domain_free(d);
694 }
695
696 void arch_domain_shutdown(struct domain *d)
697 {
698 if ( is_viridian_domain(d) )
699 viridian_time_domain_freeze(d);
700 }
701
702 void arch_domain_pause(struct domain *d)
703 {
704 if ( is_viridian_domain(d) )
705 viridian_time_domain_freeze(d);
706 }
707
708 void arch_domain_unpause(struct domain *d)
709 {
710 if ( is_viridian_domain(d) )
711 viridian_time_domain_thaw(d);
712 }
713
714 int arch_domain_soft_reset(struct domain *d)
715 {
716 struct page_info *page = virt_to_page(d->shared_info), *new_page;
717 int ret = 0;
718 struct domain *owner;
719 mfn_t mfn;
720 gfn_t gfn;
721 p2m_type_t p2mt;
722 unsigned int i;
723
724 /* Soft reset is supported for HVM domains only. */
725 if ( !is_hvm_domain(d) )
726 return -EINVAL;
727
728 spin_lock(&d->event_lock);
729 for ( i = 0; i < d->nr_pirqs ; i++ )
730 {
731 if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
732 {
733 ret = unmap_domain_pirq_emuirq(d, i);
734 if ( ret )
735 break;
736 }
737 }
738 spin_unlock(&d->event_lock);
739
740 if ( ret )
741 return ret;
742
743 /*
744 * The shared_info page needs to be replaced with a new page, otherwise we
745 * will get a hole if the domain does XENMAPSPACE_shared_info.
746 */
747
748 owner = page_get_owner_and_reference(page);
749 ASSERT( owner == d );
750
751 mfn = page_to_mfn(page);
752 gfn = mfn_to_gfn(d, mfn);
753
754 /*
755 * gfn == INVALID_GFN indicates that the shared_info page was never mapped
756 * to the domain's address space and there is nothing to replace.
757 */
758 if ( gfn_eq(gfn, INVALID_GFN) )
759 goto exit_put_page;
760
761 if ( !mfn_eq(get_gfn_query(d, gfn_x(gfn), &p2mt), mfn) )
762 {
763 printk(XENLOG_G_ERR
764 "Failed to get %pd's shared_info GFN (%"PRI_gfn")\n",
765 d, gfn_x(gfn));
766 ret = -EINVAL;
767 goto exit_put_gfn;
768 }
769
770 new_page = alloc_domheap_page(d, 0);
771 if ( !new_page )
772 {
773 printk(XENLOG_G_ERR
774 "Failed to alloc a page to replace %pd's shared_info GFN %"PRI_gfn"\n",
775 d, gfn_x(gfn));
776 ret = -ENOMEM;
777 goto exit_put_gfn;
778 }
779
780 ret = guest_physmap_remove_page(d, gfn, mfn, PAGE_ORDER_4K);
781 if ( ret )
782 {
783 printk(XENLOG_G_ERR
784 "Failed to remove %pd's shared_info GFN %"PRI_gfn"\n",
785 d, gfn_x(gfn));
786 free_domheap_page(new_page);
787 goto exit_put_gfn;
788 }
789
790 ret = guest_physmap_add_page(d, gfn, page_to_mfn(new_page),
791 PAGE_ORDER_4K);
792 if ( ret )
793 {
794 printk(XENLOG_G_ERR
795 "Failed to add a page to replace %pd's shared_info frame %"PRI_gfn"\n",
796 d, gfn_x(gfn));
797 free_domheap_page(new_page);
798 }
799 exit_put_gfn:
800 put_gfn(d, gfn_x(gfn));
801 exit_put_page:
802 put_page(page);
803
804 return ret;
805 }
806
807 void arch_domain_creation_finished(struct domain *d)
808 {
809 }
810
811 #define xen_vcpu_guest_context vcpu_guest_context
812 #define fpu_ctxt fpu_ctxt.x
813 CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
814 #undef fpu_ctxt
815 #undef xen_vcpu_guest_context
816
817 /* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */
818 int arch_set_info_guest(
819 struct vcpu *v, vcpu_guest_context_u c)
820 {
821 struct domain *d = v->domain;
822 unsigned int i;
823 unsigned long flags;
824 bool compat;
825 #ifdef CONFIG_PV
826 mfn_t cr3_mfn;
827 struct page_info *cr3_page = NULL;
828 unsigned int nr_gdt_frames;
829 int rc = 0;
830 #endif
831
832 /* The context is a compat-mode one if the target domain is compat-mode;
833 * we expect the tools to DTRT even in compat-mode callers. */
834 compat = is_pv_32bit_domain(d);
835
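/* Access field @fld of whichever context layout (compat or native) applies. */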
836 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
837 flags = c(flags);
838
839 if ( is_pv_domain(d) )
840 {
841 if ( !compat )
842 {
843 if ( !is_canonical_address(c.nat->user_regs.rip) ||
844 !is_canonical_address(c.nat->user_regs.rsp) ||
845 !is_canonical_address(c.nat->kernel_sp) ||
846 (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) ||
847 !is_canonical_address(c.nat->fs_base) ||
848 !is_canonical_address(c.nat->gs_base_kernel) ||
849 !is_canonical_address(c.nat->gs_base_user) ||
850 !is_canonical_address(c.nat->event_callback_eip) ||
851 !is_canonical_address(c.nat->syscall_callback_eip) ||
852 !is_canonical_address(c.nat->failsafe_callback_eip) )
853 return -EINVAL;
854
855 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
856 fixup_guest_stack_selector(d, c.nat->kernel_ss);
857 fixup_guest_code_selector(d, c.nat->user_regs.cs);
858
859 for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ )
860 {
861 if ( !is_canonical_address(c.nat->trap_ctxt[i].address) )
862 return -EINVAL;
863 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
864 }
865
866 if ( !__addr_ok(c.nat->ldt_base) )
867 return -EINVAL;
868 }
869 else
870 {
871 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
872 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
873 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
874 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
875 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
876
877 for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
878 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
879 }
880
881 /* LDT safety checks. */
882 if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
883 (c(ldt_ents) > 8192) )
884 return -EINVAL;
885
886 v->arch.pv.vgc_flags = flags;
887 }
888
889 v->arch.flags |= TF_kernel_mode;
890 if ( unlikely(!(flags & VGCF_in_kernel)) &&
891 /*
892 * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
893 * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
894 * shadow_one_bit_disable() for why that is.
895 */
896 !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
897 v->arch.flags &= ~TF_kernel_mode;
898
899 vcpu_setup_fpu(v, v->arch.xsave_area,
900 flags & VGCF_I387_VALID ? &c.nat->fpu_ctxt : NULL,
901 FCW_DEFAULT);
902
903 if ( !compat )
904 {
905 memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
906 if ( is_pv_domain(d) )
907 memcpy(v->arch.pv.trap_ctxt, c.nat->trap_ctxt,
908 sizeof(c.nat->trap_ctxt));
909 }
910 else
911 {
912 XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
913 if ( is_pv_domain(d) )
914 {
915 for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
916 XLAT_trap_info(v->arch.pv.trap_ctxt + i,
917 c.cmp->trap_ctxt + i);
918 }
919 }
920
921 if ( v->vcpu_id == 0 && (c(vm_assist) & ~arch_vm_assist_valid_mask(d)) )
922 return -EINVAL;
923
924 if ( is_hvm_domain(d) )
925 {
926 for ( i = 0; i < ARRAY_SIZE(v->arch.dr); ++i )
927 v->arch.dr[i] = c(debugreg[i]);
928 v->arch.dr6 = c(debugreg[6]);
929 v->arch.dr7 = c(debugreg[7]);
930
931 if ( v->vcpu_id == 0 )
932 d->vm_assist = c.nat->vm_assist;
933
934 hvm_set_info_guest(v);
935 goto out;
936 }
937
938 #ifdef CONFIG_PV
939 /* IOPL privileges are virtualised. */
940 v->arch.pv.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
941 v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
942
943 /* Ensure real hardware interrupts are enabled. */
944 v->arch.user_regs.eflags |= X86_EFLAGS_IF;
945
946 nr_gdt_frames = DIV_ROUND_UP(c(gdt_ents), 512);
947 if ( nr_gdt_frames > ARRAY_SIZE(v->arch.pv.gdt_frames) )
948 return -EINVAL;
949
950 if ( !v->is_initialised )
951 {
952 if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
953 return -EINVAL;
954
955 v->arch.pv.ldt_ents = c(ldt_ents);
956 v->arch.pv.ldt_base = v->arch.pv.ldt_ents
957 ? c(ldt_base)
958 : (unsigned long)ZERO_BLOCK_PTR;
959 }
960 else
961 {
962 unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
963 bool fail;
964
965 if ( !compat )
966 {
967 fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
968 if ( pagetable_is_null(v->arch.guest_table_user) )
969 fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
970 else
971 {
972 pfn = pagetable_get_pfn(v->arch.guest_table_user);
973 fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
974 }
975 } else {
976 l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
977
978 pfn = l4e_get_pfn(*l4tab);
979 unmap_domain_page(l4tab);
980 fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
981 }
982
983 fail |= v->arch.pv.gdt_ents != c(gdt_ents);
984 for ( i = 0; !fail && i < nr_gdt_frames; ++i )
985 fail = v->arch.pv.gdt_frames[i] != c(gdt_frames[i]);
986
987 fail |= v->arch.pv.ldt_ents != c(ldt_ents);
988 if ( v->arch.pv.ldt_ents )
989 fail |= v->arch.pv.ldt_base != c(ldt_base);
990
991 if ( fail )
992 return -EOPNOTSUPP;
993 }
994
995 v->arch.pv.kernel_ss = c(kernel_ss);
996 v->arch.pv.kernel_sp = c(kernel_sp);
997 for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ctrlreg); ++i )
998 v->arch.pv.ctrlreg[i] = c(ctrlreg[i]);
999
1000 v->arch.pv.event_callback_eip = c(event_callback_eip);
1001 v->arch.pv.failsafe_callback_eip = c(failsafe_callback_eip);
1002 if ( !compat )
1003 {
1004 v->arch.pv.syscall_callback_eip = c.nat->syscall_callback_eip;
1005 v->arch.pv.fs_base = c.nat->fs_base;
1006 v->arch.pv.gs_base_kernel = c.nat->gs_base_kernel;
1007 v->arch.pv.gs_base_user = c.nat->gs_base_user;
1008 }
1009 else
1010 {
1011 v->arch.pv.event_callback_cs = c(event_callback_cs);
1012 v->arch.pv.failsafe_callback_cs = c(failsafe_callback_cs);
1013 }
1014
1015 /* Only CR0.TS is modifiable by guest or admin. */
1016 v->arch.pv.ctrlreg[0] &= X86_CR0_TS;
1017 v->arch.pv.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
1018
1019 v->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(v, v->arch.pv.ctrlreg[4]);
1020
1021 memset(v->arch.dr, 0, sizeof(v->arch.dr));
1022 v->arch.dr6 = X86_DR6_DEFAULT;
1023 v->arch.dr7 = X86_DR7_DEFAULT;
1024 v->arch.pv.dr7_emul = 0;
1025
1026 for ( i = 0; i < ARRAY_SIZE(v->arch.dr); i++ )
1027 set_debugreg(v, i, c(debugreg[i]));
1028 set_debugreg(v, 6, c(debugreg[6]));
1029 set_debugreg(v, 7, c(debugreg[7]));
1030
1031 if ( v->is_initialised )
1032 goto out;
1033
1034 if ( v->vcpu_id == 0 )
1035 {
1036 /*
1037 * In the restore case we need to deal with L4 pages which got
1038 * initialized with m2p_strict still clear (and which hence lack the
1039 * correct initial RO_MPT_VIRT_{START,END} L4 entry).
1040 */
1041 if ( d != current->domain && !VM_ASSIST(d, m2p_strict) &&
1042 is_pv_domain(d) && !is_pv_32bit_domain(d) &&
1043 test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) &&
1044 atomic_read(&d->arch.pv.nr_l4_pages) )
1045 {
1046 bool done = false;
1047
1048 spin_lock_recursive(&d->page_alloc_lock);
1049
1050 for ( i = 0; ; )
1051 {
1052 struct page_info *page = page_list_remove_head(&d->page_list);
1053
1054 if ( page_lock(page) )
1055 {
1056 if ( (page->u.inuse.type_info & PGT_type_mask) ==
1057 PGT_l4_page_table )
1058 done = !fill_ro_mpt(page_to_mfn(page));
1059
1060 page_unlock(page);
1061 }
1062
1063 page_list_add_tail(page, &d->page_list);
1064
1065 if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
1066 break;
1067 }
1068
1069 spin_unlock_recursive(&d->page_alloc_lock);
1070
1071 if ( !done )
1072 return -ERESTART;
1073 }
1074
1075 d->vm_assist = c(vm_assist);
1076 }
1077
1078 rc = put_old_guest_table(current);
1079 if ( rc )
1080 return rc;
1081
1082 if ( !compat )
1083 rc = pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
1084 else
1085 {
1086 unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv.gdt_frames)];
1087
1088 for ( i = 0; i < nr_gdt_frames; ++i )
1089 gdt_frames[i] = c.cmp->gdt_frames[i];
1090
1091 rc = pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents);
1092 }
1093 if ( rc != 0 )
1094 return rc;
1095
1096 set_bit(_VPF_in_reset, &v->pause_flags);
1097
1098 if ( !compat )
1099 cr3_mfn = _mfn(xen_cr3_to_pfn(c.nat->ctrlreg[3]));
1100 else
1101 cr3_mfn = _mfn(compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
1102 cr3_page = get_page_from_mfn(cr3_mfn, d);
1103
1104 if ( !cr3_page )
1105 rc = -EINVAL;
1106 else if ( paging_mode_refcounts(d) )
1107 /* nothing */;
1108 else if ( cr3_page == v->arch.old_guest_table )
1109 {
1110 v->arch.old_guest_table = NULL;
1111 put_page(cr3_page);
1112 }
1113 else
1114 {
1115 if ( !compat )
1116 rc = put_old_guest_table(v);
1117 if ( !rc )
1118 rc = get_page_type_preemptible(cr3_page,
1119 !compat ? PGT_root_page_table
1120 : PGT_l3_page_table);
1121 switch ( rc )
1122 {
1123 case -EINTR:
1124 rc = -ERESTART;
1125 case -ERESTART:
1126 break;
1127 case 0:
1128 if ( !compat && !VM_ASSIST(d, m2p_strict) &&
1129 !paging_mode_refcounts(d) )
1130 fill_ro_mpt(cr3_mfn);
1131 break;
1132 default:
1133 if ( cr3_page == current->arch.old_guest_table )
1134 cr3_page = NULL;
1135 break;
1136 }
1137 }
1138 if ( rc )
1139 /* handled below */;
1140 else if ( !compat )
1141 {
1142 v->arch.guest_table = pagetable_from_page(cr3_page);
1143 if ( c.nat->ctrlreg[1] )
1144 {
1145 cr3_mfn = _mfn(xen_cr3_to_pfn(c.nat->ctrlreg[1]));
1146 cr3_page = get_page_from_mfn(cr3_mfn, d);
1147
1148 if ( !cr3_page )
1149 rc = -EINVAL;
1150 else if ( !paging_mode_refcounts(d) )
1151 {
1152 rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
1153 switch ( rc )
1154 {
1155 case -EINTR:
1156 rc = -ERESTART;
1157 /* Fallthrough */
1158 case -ERESTART:
1159 /*
1160 * NB that we're putting the kernel-mode table
1161 * here, which we've already successfully
1162 * validated above; hence partial = false;
1163 */
1164 v->arch.old_guest_ptpg = NULL;
1165 v->arch.old_guest_table =
1166 pagetable_get_page(v->arch.guest_table);
1167 v->arch.old_guest_table_partial = false;
1168 v->arch.guest_table = pagetable_null();
1169 break;
1170 default:
1171 if ( cr3_page == current->arch.old_guest_table )
1172 cr3_page = NULL;
1173 break;
1174 case 0:
1175 if ( VM_ASSIST(d, m2p_strict) )
1176 zap_ro_mpt(cr3_mfn);
1177 break;
1178 }
1179 }
1180 if ( !rc )
1181 v->arch.guest_table_user = pagetable_from_page(cr3_page);
1182 }
1183 }
1184 else
1185 {
1186 l4_pgentry_t *l4tab;
1187
1188 l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
1189 *l4tab = l4e_from_mfn(page_to_mfn(cr3_page),
1190 _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
1191 unmap_domain_page(l4tab);
1192 }
1193 if ( rc )
1194 {
1195 if ( cr3_page )
1196 put_page(cr3_page);
1197 pv_destroy_gdt(v);
1198 return rc;
1199 }
1200
1201 clear_bit(_VPF_in_reset, &v->pause_flags);
1202
1203 if ( v->vcpu_id == 0 )
1204 update_domain_wallclock_time(d);
1205
1206 /* Don't redo final setup */
1207 v->is_initialised = 1;
1208
1209 if ( paging_mode_enabled(d) )
1210 paging_update_paging_modes(v);
1211
1212 update_cr3(v);
1213 #endif /* CONFIG_PV */
1214
1215 out:
1216 if ( flags & VGCF_online )
1217 clear_bit(_VPF_down, &v->pause_flags);
1218 else
1219 set_bit(_VPF_down, &v->pause_flags);
1220 return 0;
1221 #undef c
1222 }
1223
1224 int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1225 {
1226 int rc;
1227
1228 if ( is_hvm_vcpu(v) )
1229 {
1230 struct domain *d = v->domain;
1231 struct vcpu_hvm_context ctxt;
1232
1233 if ( copy_from_guest(&ctxt, arg, 1) )
1234 return -EFAULT;
1235
1236 domain_lock(d);
1237 rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt);
1238 domain_unlock(d);
1239 }
1240 else
1241 rc = default_initialise_vcpu(v, arg);
1242
1243 return rc;
1244 }
1245
1246 int arch_vcpu_reset(struct vcpu *v)
1247 {
1248 v->arch.async_exception_mask = 0;
1249 memset(v->arch.async_exception_state, 0,
1250 sizeof(v->arch.async_exception_state));
1251
1252 if ( is_pv_vcpu(v) )
1253 {
1254 pv_destroy_gdt(v);
1255 return vcpu_destroy_pagetables(v);
1256 }
1257
1258 vcpu_end_shutdown_deferral(v);
1259 return 0;
1260 }
1261
1262 long
1263 arch_do_vcpu_op(
1264 int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1265 {
1266 long rc = 0;
1267
1268 switch ( cmd )
1269 {
1270 case VCPUOP_send_nmi:
1271 if ( !guest_handle_is_null(arg) )
1272 return -EINVAL;
1273
1274 if ( !test_and_set_bool(v->arch.nmi_pending) )
1275 vcpu_kick(v);
1276 break;
1277
1278 case VCPUOP_register_vcpu_time_memory_area:
1279 {
1280 struct vcpu_register_time_memory_area area;
1281
1282 rc = -EFAULT;
1283 if ( copy_from_guest(&area, arg, 1) )
1284 break;
1285
1286 if ( !guest_handle_okay(area.addr.h, 1) )
1287 break;
1288
1289 rc = 0;
1290 v->arch.time_info_guest = area.addr.h;
1291
1292 force_update_vcpu_system_time(v);
1293
1294 break;
1295 }
1296
1297 case VCPUOP_get_physid:
1298 {
1299 struct vcpu_get_physid cpu_id;
1300
1301 rc = -EINVAL;
1302 if ( !is_hwdom_pinned_vcpu(v) )
1303 break;
1304
1305 cpu_id.phys_id =
1306 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1307 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1308
1309 rc = -EFAULT;
1310 if ( copy_to_guest(arg, &cpu_id, 1) )
1311 break;
1312
1313 rc = 0;
1314 break;
1315 }
1316
1317 default:
1318 rc = -ENOSYS;
1319 break;
1320 }
1321
1322 return rc;
1323 }
1324
1325 /*
1326 * Notes on PV segment handling:
1327 * - 32bit: All data from the GDT/LDT.
1328 * - 64bit: In addition, 64bit FS/GS/GS_KERN bases.
1329 *
1330 * Linux's ABI with userspace expects to preserve the full selector and
1331 * segment base, even sel != NUL, base != GDT/LDT for 64bit code. Xen must
1332 * honour this when context switching, to avoid breaking Linux's ABI.
1333 *
1334 * Note: It is impossible to preserve a selector value of 1, 2 or 3, as these
1335 * get reset to 0 by an IRET back to guest context. Code playing with
1336 * arcane corners of x86 get to keep all resulting pieces.
1337 *
1338 * Therefore, we:
1339 * - Load the LDT.
1340 * - Load each segment selector.
1341 * - Any error loads zero, and triggers a failsafe callback.
1342 * - For 64bit, further load the 64bit bases.
1343 *
1344 * An optimisation exists on SVM-capable hardware, where we use a VMLOAD
1345 * instruction to load the LDT and full FS/GS/GS_KERN data in one go.
1346 *
1347 * AMD-like CPUs prior to Zen2 do not zero the segment base or limit when
1348 * loading a NUL selector. This is a problem in principle when context
1349 * switching to a 64bit guest, as a NUL FS/GS segment is usable and will pick
1350 * up the stale base.
1351 *
1352 * However, it is not an issue in practice. NUL segments are unusable for
1353 * 32bit guests (so any stale base won't be used), and we unconditionally
1354 * write the full FS/GS bases for 64bit guests.
1355 */
1356 static void load_segments(struct vcpu *n)
1357 {
1358 struct cpu_user_regs *uregs = &n->arch.user_regs;
1359 bool compat = is_pv_32bit_vcpu(n);
1360 bool all_segs_okay = true, fs_gs_done = false;
1361
1362 /*
1363 * Attempt to load @seg with selector @val. On error, clear
1364 * @all_segs_okay in function scope, and load NUL into @sel.
1365 */
1366 #define TRY_LOAD_SEG(seg, val) \
1367 asm volatile ( "1: mov %k[_val], %%" #seg "\n\t" \
1368 "2:\n\t" \
1369 ".section .fixup, \"ax\"\n\t" \
1370 "3: xor %k[ok], %k[ok]\n\t" \
1371 " mov %k[ok], %%" #seg "\n\t" \
1372 " jmp 2b\n\t" \
1373 ".previous\n\t" \
1374 _ASM_EXTABLE(1b, 3b) \
1375 : [ok] "+r" (all_segs_okay) \
1376 : [_val] "rm" (val) )
1377
1378 #ifdef CONFIG_HVM
1379 if ( cpu_has_svm && !compat && (uregs->fs | uregs->gs) <= 3 )
1380 {
1381 unsigned long gsb = n->arch.flags & TF_kernel_mode
1382 ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
1383 unsigned long gss = n->arch.flags & TF_kernel_mode
1384 ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;
1385
1386 fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
1387 n->arch.pv.fs_base, gsb, gss);
1388 }
1389 #endif
1390 if ( !fs_gs_done )
1391 {
1392 load_LDT(n);
1393
1394 TRY_LOAD_SEG(fs, uregs->fs);
1395 TRY_LOAD_SEG(gs, uregs->gs);
1396 }
1397
1398 TRY_LOAD_SEG(ds, uregs->ds);
1399 TRY_LOAD_SEG(es, uregs->es);
1400
1401 if ( !fs_gs_done && !compat )
1402 {
1403 wrfsbase(n->arch.pv.fs_base);
1404 wrgsshadow(n->arch.pv.gs_base_kernel);
1405 wrgsbase(n->arch.pv.gs_base_user);
1406
1407 /* If in kernel mode then switch the GS bases around. */
1408 if ( (n->arch.flags & TF_kernel_mode) )
1409 asm volatile ( "swapgs" );
1410 }
1411
1412 if ( unlikely(!all_segs_okay) )
1413 {
1414 struct pv_vcpu *pv = &n->arch.pv;
1415 struct cpu_user_regs *regs = guest_cpu_user_regs();
1416 unsigned long *rsp =
1417 (unsigned long *)(((n->arch.flags & TF_kernel_mode)
1418 ? regs->rsp : pv->kernel_sp) & ~0xf);
1419 unsigned long cs_and_mask, rflags;
1420
1421 /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */
1422 rflags = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1423 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1424 if ( VM_ASSIST(n->domain, architectural_iopl) )
1425 rflags |= n->arch.pv.iopl;
1426
1427 if ( is_pv_32bit_vcpu(n) )
1428 {
1429 unsigned int *esp = ring_1(regs) ?
1430 (unsigned int *)regs->rsp :
1431 (unsigned int *)pv->kernel_sp;
1432 int ret = 0;
1433
1434 /* CS longword also contains full evtchn_upcall_mask. */
1435 cs_and_mask = (unsigned short)regs->cs |
1436 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1437
1438 if ( !ring_1(regs) )
1439 {
1440 ret = put_user(regs->ss, esp-1);
1441 ret |= put_user(regs->esp, esp-2);
1442 esp -= 2;
1443 }
1444
1445 if ( ret |
1446 put_user(rflags, esp-1) |
1447 put_user(cs_and_mask, esp-2) |
1448 put_user(regs->eip, esp-3) |
1449 put_user(uregs->gs, esp-4) |
1450 put_user(uregs->fs, esp-5) |
1451 put_user(uregs->es, esp-6) |
1452 put_user(uregs->ds, esp-7) )
1453 {
1454 gprintk(XENLOG_ERR,
1455 "error while creating compat failsafe callback frame\n");
1456 domain_crash(n->domain);
1457 }
1458
1459 if ( n->arch.pv.vgc_flags & VGCF_failsafe_disables_events )
1460 vcpu_info(n, evtchn_upcall_mask) = 1;
1461
1462 regs->entry_vector |= TRAP_syscall;
1463 regs->eflags &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT|
1464 X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1465 regs->ss = FLAT_COMPAT_KERNEL_SS;
1466 regs->esp = (unsigned long)(esp-7);
1467 regs->cs = FLAT_COMPAT_KERNEL_CS;
1468 regs->eip = pv->failsafe_callback_eip;
1469 return;
1470 }
1471
1472 if ( !(n->arch.flags & TF_kernel_mode) )
1473 toggle_guest_mode(n);
1474 else
1475 regs->cs &= ~3;
1476
1477 /* CS longword also contains full evtchn_upcall_mask. */
1478 cs_and_mask = (unsigned long)regs->cs |
1479 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1480
1481 if ( put_user(regs->ss, rsp- 1) |
1482 put_user(regs->rsp, rsp- 2) |
1483 put_user(rflags, rsp- 3) |
1484 put_user(cs_and_mask, rsp- 4) |
1485 put_user(regs->rip, rsp- 5) |
1486 put_user(uregs->gs, rsp- 6) |
1487 put_user(uregs->fs, rsp- 7) |
1488 put_user(uregs->es, rsp- 8) |
1489 put_user(uregs->ds, rsp- 9) |
1490 put_user(regs->r11, rsp-10) |
1491 put_user(regs->rcx, rsp-11) )
1492 {
1493 gprintk(XENLOG_ERR,
1494 "error while creating failsafe callback frame\n");
1495 domain_crash(n->domain);
1496 }
1497
1498 if ( n->arch.pv.vgc_flags & VGCF_failsafe_disables_events )
1499 vcpu_info(n, evtchn_upcall_mask) = 1;
1500
1501 regs->entry_vector |= TRAP_syscall;
1502 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1503 X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1504 regs->ss = FLAT_KERNEL_SS;
1505 regs->rsp = (unsigned long)(rsp-11);
1506 regs->cs = FLAT_KERNEL_CS;
1507 regs->rip = pv->failsafe_callback_eip;
1508 }
1509 }
1510
1511 /*
1512 * Record all guest segment state. The guest can load segment selectors
1513 * without trapping, which will also alter the 64bit FS/GS bases. Arbitrary
1514 * changes to bases can also be made with the WR{FS,GS}BASE instructions, when
1515 * enabled.
1516 *
1517 * Guests however cannot use SWAPGS, so there is no mechanism to modify the
1518 * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive
1519 * GS base is still accurate, and doesn't need reading back from hardware.
1520 */
1521 static void save_segments(struct vcpu *v)
1522 {
1523 struct cpu_user_regs *regs = &v->arch.user_regs;
1524
1525 regs->ds = read_sreg(ds);
1526 regs->es = read_sreg(es);
1527 regs->fs = read_sreg(fs);
1528 regs->gs = read_sreg(gs);
1529
1530 if ( !is_pv_32bit_vcpu(v) )
1531 {
1532 unsigned long gs_base = rdgsbase();
1533
1534 v->arch.pv.fs_base = rdfsbase();
1535 if ( v->arch.flags & TF_kernel_mode )
1536 v->arch.pv.gs_base_kernel = gs_base;
1537 else
1538 v->arch.pv.gs_base_user = gs_base;
1539 }
1540 }
1541
1542 void paravirt_ctxt_switch_from(struct vcpu *v)
1543 {
1544 save_segments(v);
1545
1546 /*
1547 * Disable debug breakpoints. We do this aggressively because if we switch
1548      * to an HVM guest we may load DR0-DR3 with values that can cause #DB
1549 * inside Xen, before we get a chance to reload DR7, and this cannot always
1550 * safely be handled.
1551 */
1552 if ( unlikely(v->arch.dr7 & DR7_ACTIVE_MASK) )
1553 write_debugreg(7, 0);
1554 }
1555
1556 void paravirt_ctxt_switch_to(struct vcpu *v)
1557 {
1558 root_pgentry_t *root_pgt = this_cpu(root_pgt);
1559
1560 if ( root_pgt )
1561 root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
1562 l4e_from_page(v->domain->arch.perdomain_l3_pg,
1563 __PAGE_HYPERVISOR_RW);
1564
1565 if ( unlikely(v->arch.dr7 & DR7_ACTIVE_MASK) )
1566 activate_debugregs(v);
1567
1568 if ( cpu_has_msr_tsc_aux )
1569 wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
1570 }
1571
1572 /* Update per-VCPU guest runstate shared memory area (if registered). */
1573 bool update_runstate_area(struct vcpu *v)
1574 {
1575 bool rc;
1576 struct guest_memory_policy policy = { .nested_guest_mode = false };
1577 void __user *guest_handle = NULL;
1578 struct vcpu_runstate_info runstate;
1579
1580 if ( guest_handle_is_null(runstate_guest(v)) )
1581 return true;
1582
1583 update_guest_memory_policy(v, &policy);
1584
1585 memcpy(&runstate, &v->runstate, sizeof(runstate));
1586
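    /*
     * If the guest asked for update notifications, set XEN_RUNSTATE_UPDATE in
     * the guest-visible copy of state_entry_time before writing the rest of
     * the area, and clear it again afterwards (each with a write barrier), so
     * the guest can detect an update in progress and avoid torn reads.
     */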
1587 if ( VM_ASSIST(v->domain, runstate_update_flag) )
1588 {
1589 guest_handle = has_32bit_shinfo(v->domain)
1590 ? &v->runstate_guest.compat.p->state_entry_time + 1
1591 : &v->runstate_guest.native.p->state_entry_time + 1;
1592 guest_handle--;
1593 runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
1594 __raw_copy_to_guest(guest_handle,
1595 (void *)(&runstate.state_entry_time + 1) - 1, 1);
1596 smp_wmb();
1597 }
1598
1599 if ( has_32bit_shinfo(v->domain) )
1600 {
1601 struct compat_vcpu_runstate_info info;
1602
1603 XLAT_vcpu_runstate_info(&info, &runstate);
1604 __copy_to_guest(v->runstate_guest.compat, &info, 1);
1605 rc = true;
1606 }
1607 else
1608 rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
1609 sizeof(runstate);
1610
1611 if ( guest_handle )
1612 {
1613 runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
1614 smp_wmb();
1615 __raw_copy_to_guest(guest_handle,
1616 (void *)(&runstate.state_entry_time + 1) - 1, 1);
1617 }
1618
1619 update_guest_memory_policy(v, &policy);
1620
1621 return rc;
1622 }
1623
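/*
 * Best-effort wrapper: if the runstate area couldn't be updated for a PV vcpu
 * currently in user mode, flag it so the update can be retried once the vcpu
 * re-enters kernel mode.
 */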
1624 static void _update_runstate_area(struct vcpu *v)
1625 {
1626 if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
1627 !(v->arch.flags & TF_kernel_mode) )
1628 v->arch.pv.need_update_runstate_area = 1;
1629 }
1630
1631 /*
1632 * Overview of Xen's GDTs.
1633 *
1634 * Xen maintains per-CPU compat and regular GDTs which are both a single page
1635 * in size. Some content is specific to each CPU (the TSS, the per-CPU marker
1636 * for #DF handling, and optionally the LDT). The compat and regular GDTs
1637 * differ by the layout and content of the guest accessible selectors.
1638 *
1639 * The Xen selectors live from 0xe000 (slot 14 of 16), and need to always
1640 * appear in this position for interrupt/exception handling to work.
1641 *
1642 * A PV guest may specify GDT frames of their own (slots 0 to 13). Room for a
1643 * full GDT exists in the per-domain mappings.
1644 *
1645 * To schedule a PV vcpu, we point slot 14 of the guest's full GDT at the
1646 * current CPU's compat or regular (as appropriate) GDT frame. This is so
1647 * that the per-CPU parts still work correctly after switching pagetables and
1648 * loading the guests full GDT into GDTR.
1649 *
1650 * To schedule Idle or HVM vcpus, we load a GDT base address which causes the
1651 * regular per-CPU GDT frame to appear with selectors at the appropriate
1652 * offset.
1653 */
1654 static always_inline bool need_full_gdt(const struct domain *d)
1655 {
1656 return is_pv_domain(d) && !is_idle_domain(d);
1657 }
1658
1659 static void update_xen_slot_in_full_gdt(const struct vcpu *v, unsigned int cpu)
1660 {
1661 l1e_write(pv_gdt_ptes(v) + FIRST_RESERVED_GDT_PAGE,
1662 !is_pv_32bit_vcpu(v) ? per_cpu(gdt_l1e, cpu)
1663 : per_cpu(compat_gdt_l1e, cpu));
1664 }
1665
1666 static void load_full_gdt(const struct vcpu *v, unsigned int cpu)
1667 {
1668 struct desc_ptr gdt_desc = {
1669 .limit = LAST_RESERVED_GDT_BYTE,
1670 .base = GDT_VIRT_START(v),
1671 };
1672
1673 lgdt(&gdt_desc);
1674
1675 per_cpu(full_gdt_loaded, cpu) = true;
1676 }
1677
1678 static void load_default_gdt(unsigned int cpu)
1679 {
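    /*
     * Bias the base downwards by FIRST_RESERVED_GDT_ENTRY descriptors so that
     * the per-CPU GDT frame's contents appear at the reserved selector
     * offsets, as described in the overview comment above.
     */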
1680 struct desc_ptr gdt_desc = {
1681 .limit = LAST_RESERVED_GDT_BYTE,
1682 .base = (unsigned long)(per_cpu(gdt, cpu) - FIRST_RESERVED_GDT_ENTRY),
1683 };
1684
1685 lgdt(&gdt_desc);
1686
1687 per_cpu(full_gdt_loaded, cpu) = false;
1688 }
1689
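/*
 * Perform the actual state switch from per_cpu(curr_vcpu) to 'current'.
 * This is deliberately separate from context_switch(): switches to the idle
 * vcpu are done lazily, leaving the previous guest's state loaded until
 * either another guest vcpu is scheduled here or the state is explicitly
 * flushed via __sync_local_execstate().
 */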
1690 static void __context_switch(void)
1691 {
1692 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1693 unsigned int cpu = smp_processor_id();
1694 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1695 struct vcpu *n = current;
1696 struct domain *pd = p->domain, *nd = n->domain;
1697
1698 ASSERT(p != n);
1699 ASSERT(!vcpu_cpu_dirty(n));
1700
1701 if ( !is_idle_domain(pd) )
1702 {
1703 ASSERT(read_atomic(&p->dirty_cpu) == cpu);
1704 memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
1705 vcpu_save_fpu(p);
1706 pd->arch.ctxt_switch->from(p);
1707 }
1708
1709 /*
1710 * Mark this CPU in next domain's dirty cpumasks before calling
1711 * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1712 * which is synchronised on that function.
1713 */
1714 if ( pd != nd )
1715 cpumask_set_cpu(cpu, nd->dirty_cpumask);
1716 write_atomic(&n->dirty_cpu, cpu);
1717
1718 if ( !is_idle_domain(nd) )
1719 {
1720 memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
1721 if ( cpu_has_xsave )
1722 {
1723 u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
1724
1725 if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
1726 BUG();
1727
1728 if ( cpu_has_xsaves && is_hvm_vcpu(n) )
1729 set_msr_xss(n->arch.msrs->xss.raw);
1730 }
1731 vcpu_restore_fpu_nonlazy(n, false);
1732 nd->arch.ctxt_switch->to(n);
1733 }
1734
1735 psr_ctxt_switch_to(nd);
1736
1737 if ( need_full_gdt(nd) )
1738 update_xen_slot_in_full_gdt(n, cpu);
1739
1740 if ( per_cpu(full_gdt_loaded, cpu) &&
1741 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
1742 load_default_gdt(cpu);
1743
1744 write_ptbase(n);
1745
1746 #if defined(CONFIG_PV) && defined(CONFIG_HVM)
1747 /* Prefetch the VMCB if we expect to use it later in the context switch */
1748 if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
1749 !is_idle_domain(nd) )
1750 svm_load_segs(0, 0, 0, 0, 0);
1751 #endif
1752
1753 if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
1754 load_full_gdt(n, cpu);
1755
1756 if ( pd != nd )
1757 cpumask_clear_cpu(cpu, pd->dirty_cpumask);
1758 write_atomic(&p->dirty_cpu, VCPU_CPU_CLEAN);
1759
1760 per_cpu(curr_vcpu, cpu) = n;
1761 }
1762
1763 void context_switch(struct vcpu *prev, struct vcpu *next)
1764 {
1765 unsigned int cpu = smp_processor_id();
1766 const struct domain *prevd = prev->domain, *nextd = next->domain;
1767 unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);
1768
1769 ASSERT(prev != next);
1770 ASSERT(local_irq_is_enabled());
1771
1772 get_cpu_info()->use_pv_cr3 = false;
1773 get_cpu_info()->xen_cr3 = 0;
1774
1775 if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN )
1776 {
1777 /* Remote CPU calls __sync_local_execstate() from flush IPI handler. */
1778 flush_mask(cpumask_of(dirty_cpu), FLUSH_VCPU_STATE);
1779 ASSERT(!vcpu_cpu_dirty(next));
1780 }
1781
1782 _update_runstate_area(prev);
1783 vpmu_switch_from(prev);
1784 np2m_schedule(NP2M_SCHEDLE_OUT);
1785
1786 if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm.tm_list) )
1787 pt_save_timer(prev);
1788
1789 local_irq_disable();
1790
1791 set_current(next);
1792
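    /*
     * Lazy switch: skip __context_switch() if this CPU already holds next's
     * state, or if we are switching to an idle vcpu (in which case the
     * previous vcpu's state is left in place to be flushed or reused later).
     */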
1793 if ( (per_cpu(curr_vcpu, cpu) == next) ||
1794 (is_idle_domain(nextd) && cpu_online(cpu)) )
1795 {
1796 local_irq_enable();
1797 }
1798 else
1799 {
1800 __context_switch();
1801
1802 /* Re-enable interrupts before restoring state which may fault. */
1803 local_irq_enable();
1804
1805 if ( is_pv_domain(nextd) )
1806 load_segments(next);
1807
1808 ctxt_switch_levelling(next);
1809
1810 if ( opt_ibpb && !is_idle_domain(nextd) )
1811 {
1812 static DEFINE_PER_CPU(unsigned int, last);
1813 unsigned int *last_id = &this_cpu(last);
1814
1815 /*
1816 * Squash the domid and vcpu id together for comparison
1817 * efficiency. We could in principle stash and compare the struct
1818 * vcpu pointer, but this risks a false alias if a domain has died
1819 * and the same 4k page gets reused for a new vcpu.
1820 */
1821 unsigned int next_id = (((unsigned int)nextd->domain_id << 16) |
1822 (uint16_t)next->vcpu_id);
1823 BUILD_BUG_ON(MAX_VIRT_CPUS > 0xffff);
1824
1825 /*
1826 * When scheduling from a vcpu, to idle, and back to the same vcpu
1827 * (which might be common in a lightly loaded system, or when
1828 * using vcpu pinning), there is no need to issue IBPB, as we are
1829 * returning to the same security context.
1830 */
1831 if ( *last_id != next_id )
1832 {
1833 wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
1834 *last_id = next_id;
1835 }
1836 }
1837 }
1838
1839 sched_context_switched(prev, next);
1840
1841 _update_runstate_area(next);
1842 /* Must be done with interrupts enabled */
1843 vpmu_switch_to(next);
1844 np2m_schedule(NP2M_SCHEDLE_IN);
1845
1846 /* Ensure that the vcpu has an up-to-date time base. */
1847 update_vcpu_system_time(next);
1848
1849 reset_stack_and_jump_ind(nextd->arch.ctxt_switch->tail);
1850 }
1851
1852 void continue_running(struct vcpu *same)
1853 {
1854 reset_stack_and_jump_ind(same->domain->arch.ctxt_switch->tail);
1855 }
1856
1857 int __sync_local_execstate(void)
1858 {
1859 unsigned long flags;
1860 int switch_required;
1861
1862 local_irq_save(flags);
1863
1864 switch_required = (this_cpu(curr_vcpu) != current);
1865
1866 if ( switch_required )
1867 {
1868 ASSERT(current == idle_vcpu[smp_processor_id()]);
1869 __context_switch();
1870 }
1871
1872 local_irq_restore(flags);
1873
1874 return switch_required;
1875 }
1876
1877 void sync_local_execstate(void)
1878 {
1879 (void)__sync_local_execstate();
1880 }
1881
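/*
 * Ensure v's register state has been committed to memory, flushing any state
 * lazily retained on whichever CPU last ran it (possibly this one).
 */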
1882 void sync_vcpu_execstate(struct vcpu *v)
1883 {
1884 unsigned int dirty_cpu = read_atomic(&v->dirty_cpu);
1885
1886 if ( dirty_cpu == smp_processor_id() )
1887 sync_local_execstate();
1888 else if ( is_vcpu_dirty_cpu(dirty_cpu) )
1889 {
1890 /* Remote CPU calls __sync_local_execstate() from flush IPI handler. */
1891 flush_mask(cpumask_of(dirty_cpu), FLUSH_VCPU_STATE);
1892 }
1893 ASSERT(!is_vcpu_dirty_cpu(dirty_cpu) ||
1894 read_atomic(&v->dirty_cpu) != dirty_cpu);
1895 }
1896
1897 static int relinquish_memory(
1898 struct domain *d, struct page_list_head *list, unsigned long type)
1899 {
1900 struct page_info *page;
1901 unsigned long x, y;
1902 int ret = 0;
1903
1904 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1905 spin_lock_recursive(&d->page_alloc_lock);
1906
1907 while ( (page = page_list_remove_head(list)) )
1908 {
1909 /* Grab a reference to the page so it won't disappear from under us. */
1910 if ( unlikely(!get_page(page, d)) )
1911 {
1912 /* Couldn't get a reference -- someone is freeing this page. */
1913 page_list_add_tail(page, &d->arch.relmem_list);
1914 continue;
1915 }
1916
1917 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1918 ret = put_page_and_type_preemptible(page);
1919 switch ( ret )
1920 {
1921 case 0:
1922 break;
1923 case -ERESTART:
1924 case -EINTR:
1925 /*
1926 * -EINTR means PGT_validated has been re-set; re-set
1927 * PGT_pinned again so that it gets picked up next time
1928 * around.
1929 *
1930 * -ERESTART, OTOH, means PGT_partial is set instead. Put
1931 * it back on the list, but don't set PGT_pinned; the
1932 * section below will finish off de-validation. But we do
1933 * need to drop the general ref associated with
1934 * PGT_pinned, since put_page_and_type_preemptible()
1935 * didn't do it.
1936 *
1937 * NB we can do an ASSERT for PGT_validated, since we
1938 * "own" the type ref; but theoretically, the PGT_partial
1939 * could be cleared by someone else.
1940 */
1941 if ( ret == -EINTR )
1942 {
1943 ASSERT(page->u.inuse.type_info & PGT_validated);
1944 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1945 }
1946 else
1947 put_page(page);
1948
1949 ret = -ERESTART;
1950
1951 /* Put the page back on the list and drop the ref we grabbed above */
1952 page_list_add(page, list);
1953 put_page(page);
1954 goto out;
1955 default:
1956 BUG();
1957 }
1958
1959 put_page_alloc_ref(page);
1960
1961 /*
1962 * Forcibly invalidate top-most, still valid page tables at this point
1963 * to break circular 'linear page table' references as well as clean up
1964 * partially validated pages. This is okay because MMU structures are
1965 * not shared across domains and this domain is now dead. Thus top-most
1966 * valid tables are not in use so a non-zero count means circular
1967 * reference or partially validated.
1968 */
1969 y = page->u.inuse.type_info;
1970 for ( ; ; )
1971 {
1972 x = y;
1973 if ( likely((x & PGT_type_mask) != type) ||
1974 likely(!(x & (PGT_validated|PGT_partial))) )
1975 break;
1976
1977 y = cmpxchg(&page->u.inuse.type_info, x,
1978 x & ~(PGT_validated|PGT_partial));
1979 if ( likely(y == x) )
1980 {
1981                 /* No need for atomic update of type_info here: no one else updates it. */
1982 switch ( ret = devalidate_page(page, x, 1) )
1983 {
1984 case 0:
1985 break;
1986 case -EINTR:
1987 page_list_add(page, list);
1988 page->u.inuse.type_info |= PGT_validated;
1989 if ( x & PGT_partial )
1990 put_page(page);
1991 put_page(page);
1992 ret = -ERESTART;
1993 goto out;
1994 case -ERESTART:
1995 page_list_add(page, list);
1996 /*
1997 * PGT_partial holds a type ref and a general ref.
1998 * If we came in with PGT_partial set, then we 1)
1999 * don't need to grab an extra type count, and 2)
2000 * do need to drop the extra page ref we grabbed
2001 * at the top of the loop. If we didn't come in
2002              * with PGT_partial set, we 1) do need to grab an
2003 * extra type count, but 2) can transfer the page
2004 * ref we grabbed above to it.
2005 *
2006 * Note that we must increment type_info before
2007 * setting PGT_partial. Theoretically it should
2008 * be safe to drop the page ref before setting
2009 * PGT_partial, but do it afterwards just to be
2010 * extra safe.
2011 */
2012 if ( !(x & PGT_partial) )
2013 page->u.inuse.type_info++;
2014 smp_wmb();
2015 page->u.inuse.type_info |= PGT_partial;
2016 if ( x & PGT_partial )
2017 put_page(page);
2018 goto out;
2019 default:
2020 BUG();
2021 }
2022 if ( x & PGT_partial )
2023 {
2024 page->u.inuse.type_info--;
2025 put_page(page);
2026 }
2027 break;
2028 }
2029 }
2030
2031 /* Put the page on the list and /then/ potentially free it. */
2032 page_list_add_tail(page, &d->arch.relmem_list);
2033 put_page(page);
2034
2035 if ( hypercall_preempt_check() )
2036 {
2037 ret = -ERESTART;
2038 goto out;
2039 }
2040 }
2041
2042 /* list is empty at this point. */
2043 page_list_move(list, &d->arch.relmem_list);
2044
2045 out:
2046 spin_unlock_recursive(&d->page_alloc_lock);
2047 return ret;
2048 }
2049
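/*
 * Release the resources held by a dying domain: PCI devices, paging
 * assistance, vcpu page tables and vPMU state, shared pages, and the
 * Xen/guest page lists. Preemptible; progress is recorded in
 * d->arch.rel_priv so the operation can be continued.
 */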
2050 int domain_relinquish_resources(struct domain *d)
2051 {
2052 int ret;
2053 struct vcpu *v;
2054
2055 BUG_ON(!cpumask_empty(d->dirty_cpumask));
2056
2057 /*
2058 * This hypercall can take minutes of wallclock time to complete. This
2059 * logic implements a co-routine, stashing state in struct domain across
2060 * hypercall continuation boundaries.
2061 */
2062 switch ( d->arch.rel_priv )
2063 {
2064 /*
2065 * Record the current progress. Subsequent hypercall continuations
2066 * will logically restart work from this point.
2067 *
2068 * PROGRESS() markers must not be in the middle of loops. The loop
2069 * variable isn't preserved across a continuation.
2070 *
2071 * To avoid redundant work, there should be a marker before each
2072 * function which may return -ERESTART.
2073 */
2074 #define PROGRESS(x) \
2075 d->arch.rel_priv = PROG_ ## x; /* Fallthrough */ case PROG_ ## x
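
    /*
     * For illustration (using the macro above): PROGRESS(paging): expands to
     *
     *     d->arch.rel_priv = PROG_paging; case PROG_paging:
     *
     * (plus the fall-through annotation), so a continuation re-entering this
     * switch resumes at the step last recorded in d->arch.rel_priv.
     */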
2076
2077 enum {
2078 PROG_paging = 1,
2079 PROG_vcpu_pagetables,
2080 PROG_shared,
2081 PROG_xen,
2082 PROG_l4,
2083 PROG_l3,
2084 PROG_l2,
2085 PROG_done,
2086 };
2087
2088 case 0:
2089 ret = pci_release_devices(d);
2090 if ( ret )
2091 return ret;
2092
2093 PROGRESS(paging):
2094
2095 /* Tear down paging-assistance stuff. */
2096 ret = paging_teardown(d);
2097 if ( ret )
2098 return ret;
2099
2100 PROGRESS(vcpu_pagetables):
2101
2102 /*
2103 * Drop the in-use references to page-table bases and clean
2104 * up vPMU instances.
2105 */
2106 for_each_vcpu ( d, v )
2107 {
2108 ret = vcpu_destroy_pagetables(v);
2109 if ( ret )
2110 return ret;
2111
2112 vpmu_destroy(v);
2113 }
2114
2115 if ( altp2m_active(d) )
2116 {
2117 for_each_vcpu ( d, v )
2118 altp2m_vcpu_disable_ve(v);
2119 }
2120
2121 if ( is_pv_domain(d) )
2122 {
2123 for_each_vcpu ( d, v )
2124 {
2125 /* Relinquish GDT/LDT mappings. */
2126 pv_destroy_ldt(v);
2127 pv_destroy_gdt(v);
2128 }
2129 }
2130
2131 if ( d->arch.pirq_eoi_map != NULL )
2132 {
2133 unmap_domain_page_global(d->arch.pirq_eoi_map);
2134 put_page_and_type(mfn_to_page(_mfn(d->arch.pirq_eoi_map_mfn)));
2135 d->arch.pirq_eoi_map = NULL;
2136 d->arch.auto_unmask = 0;
2137 }
2138
2139 #ifdef CONFIG_MEM_SHARING
2140 PROGRESS(shared):
2141
2142 if ( is_hvm_domain(d) )
2143 {
2144 /* If the domain has shared pages, relinquish them allowing
2145 * for preemption. */
2146 ret = relinquish_shared_pages(d);
2147 if ( ret )
2148 return ret;
2149
2150 /*
2151 * If the domain is forked, decrement the parent's pause count
2152 * and release the domain.
2153 */
2154 if ( mem_sharing_is_fork(d) )
2155 {
2156 struct domain *parent = d->parent;
2157
2158 d->parent = NULL;
2159 domain_unpause(parent);
2160 put_domain(parent);
2161 }
2162 }
2163 #endif
2164
2165 spin_lock(&d->page_alloc_lock);
2166 page_list_splice(&d->arch.relmem_list, &d->page_list);
2167 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
2168 spin_unlock(&d->page_alloc_lock);
2169
2170 PROGRESS(xen):
2171
2172 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
2173 if ( ret )
2174 return ret;
2175
2176 PROGRESS(l4):
2177
2178 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
2179 if ( ret )
2180 return ret;
2181
2182 PROGRESS(l3):
2183
2184 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
2185 if ( ret )
2186 return ret;
2187
2188 PROGRESS(l2):
2189
2190 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
2191 if ( ret )
2192 return ret;
2193
2194 PROGRESS(done):
2195 break;
2196
2197 #undef PROGRESS
2198
2199 default:
2200 BUG();
2201 }
2202
2203 pit_deinit(d);
2204
2205 if ( is_hvm_domain(d) )
2206 hvm_domain_relinquish_resources(d);
2207
2208 return 0;
2209 }
2210
2211 /*
2212 * Called during vcpu construction, and each time the toolstack changes the
2213 * CPUID configuration for the domain.
2214 */
2215 void cpuid_policy_updated(struct vcpu *v)
2216 {
2217 if ( is_hvm_vcpu(v) )
2218 hvm_cpuid_policy_changed(v);
2219 }
2220
2221 void arch_dump_domain_info(struct domain *d)
2222 {
2223 paging_dump_domain_info(d);
2224 }
2225
2226 void arch_dump_vcpu_info(struct vcpu *v)
2227 {
2228 paging_dump_vcpu_info(v);
2229
2230 vpmu_dump(v);
2231 }
2232
2233 void vcpu_kick(struct vcpu *v)
2234 {
2235 /*
2236 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2237 * pending flag. These values may fluctuate (after all, we hold no
2238 * locks) but the key insight is that each change will cause
2239 * evtchn_upcall_pending to be polled.
2240 *
2241 * NB2. We save the running flag across the unblock to avoid a needless
2242 * IPI for domains that we IPI'd to unblock.
2243 */
2244 bool running = v->is_running;
2245
2246 vcpu_unblock(v);
2247 if ( running && (in_irq() || (v != current)) )
2248 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2249 }
2250
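/*
 * Set the guest's evtchn_upcall_pending flag and, if it was not already
 * set, notify the vcpu: assert the event-channel interrupt for HVM, or
 * kick the vcpu so pending events are noticed on the way back to guest
 * context.
 */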
2251 void vcpu_mark_events_pending(struct vcpu *v)
2252 {
2253 int already_pending = test_and_set_bit(
2254 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2255
2256 if ( already_pending )
2257 return;
2258
2259 if ( is_hvm_vcpu(v) )
2260 hvm_assert_evtchn_irq(v);
2261 else
2262 vcpu_kick(v);
2263 }
2264
2265 static void vcpu_kick_softirq(void)
2266 {
2267 /*
2268 * Nothing to do here: we merely prevent notifiers from racing with checks
2269 * executed on return to guest context with interrupts enabled. See, for
2270 * example, xxx_intr_assist() executed on return to HVM guest context.
2271 */
2272 }
2273
2274 static int __init init_vcpu_kick_softirq(void)
2275 {
2276 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2277 return 0;
2278 }
2279 __initcall(init_vcpu_kick_softirq);
2280
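/*
 * Pause the current domain on behalf of a debugger. Unless gdbsx is already
 * handling a vcpu event, also raise VIRQ_DEBUGGER to notify the debugger.
 * A no-op without CONFIG_CRASH_DEBUG.
 */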
2281 void domain_pause_for_debugger(void)
2282 {
2283 #ifdef CONFIG_CRASH_DEBUG
2284 struct vcpu *curr = current;
2285 struct domain *d = curr->domain;
2286
2287 domain_pause_by_systemcontroller_nosync(d);
2288
2289     /* If gdbsx is active, pausing the domain is all that is needed. */
2290 if ( curr->arch.gdbsx_vcpu_event == 0 )
2291 send_global_virq(VIRQ_DEBUGGER);
2292 #endif
2293 }
2294
2295 /*
2296 * Local variables:
2297 * mode: C
2298 * c-file-style: "BSD"
2299 * c-basic-offset: 4
2300 * tab-width: 4
2301 * indent-tabs-mode: nil
2302 * End:
2303 */
2304