/******************************************************************************
 * arch/x86/domain.c
 *
 * x86-specific domain handling (e.g., register setup and context switching).
 */

/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/grant_table.h>
#include <xen/iocap.h>
#include <xen/kernel.h>
#include <xen/hypercall.h>
#include <xen/multicall.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/percpu.h>
#include <xen/compat.h>
#include <xen/acpi.h>
#include <xen/pci.h>
#include <xen/paging.h>
#include <xen/cpu.h>
#include <xen/wait.h>
#include <xen/guest_access.h>
#include <xen/livepatch.h>
#include <public/sysctl.h>
#include <public/hvm/hvm_vcpu.h>
#include <asm/altp2m.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/xstate.h>
#include <asm/cpuidle.h>
#include <asm/mpspec.h>
#include <asm/ldt.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/support.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/viridian.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/spec_ctrl.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/amd.h>
#include <xen/numa.h>
#include <xen/iommu.h>
#include <compat/vcpu.h>
#include <asm/psr.h>
#include <asm/pv/domain.h>
#include <asm/pv/mm.h>
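/*
 * Per-CPU pointer to the vcpu whose register state is currently loaded on
 * the CPU.  Due to lazy context switching this can lag behind 'current',
 * e.g. while the idle vcpu runs without the outgoing guest state having
 * been switched out.
 */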
DEFINE_PER_CPU(struct vcpu *, curr_vcpu);

static void default_idle(void);
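/*
 * Default idle hooks.  Lower-level cpuidle drivers (for example the ACPI
 * C-state or mwait-idle code) may replace these pointers at boot with more
 * capable implementations.
 */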
void (*pm_idle) (void) __read_mostly = default_idle;
void (*dead_idle) (void) __read_mostly = default_dead_idle;

static void default_idle(void)
{
    struct cpu_info *info = get_cpu_info();

    local_irq_disable();
    if ( cpu_is_haltable(smp_processor_id()) )
    {
        spec_ctrl_enter_idle(info);
        safe_halt();
        spec_ctrl_exit_idle(info);
    }
    else
        local_irq_enable();
}

void default_dead_idle(void)
{
    /*
     * When going into S3, modified data may be held indefinitely by the
     * CPUs spinning here if the caches are not flushed first, and would
     * then get discarded by a subsequent INIT.
     */
    spec_ctrl_enter_idle(get_cpu_info());
    wbinvd();
    halt();
    spec_ctrl_exit_idle(get_cpu_info());
}

void play_dead(void)
{
    unsigned int cpu = smp_processor_id();

    local_irq_disable();

    /* Change the NMI handler to a nop (see comment below). */
    _set_gate_lower(&idt_tables[cpu][TRAP_nmi], SYS_DESC_irq_gate, 0,
                    &trap_nop);

    /*
     * NOTE: After cpu_exit_clear, per-cpu variables may no longer be
     * accessible, as they may be freed at any time if offline CPUs don't get
     * parked. In this case, heap corruption or #PF can occur (when heap
     * debugging is enabled). For example, even printk() can involve tasklet
     * scheduling, which touches per-cpu vars.
     *
     * Consider very carefully when adding code to *dead_idle. Most hypervisor
     * subsystems are unsafe to call.
     */
    cpu_exit_clear(cpu);

    for ( ; ; )
        dead_idle();
}

static void noreturn idle_loop(void)
{
    unsigned int cpu = smp_processor_id();
    /*
     * Idle vcpus might be attached to non-idle units! We don't do any
     * standard idle work like tasklets or livepatching in this case.
     */
    bool guest = !is_idle_domain(current->sched_unit->domain);

    for ( ; ; )
    {
        if ( cpu_is_offline(cpu) )
        {
            ASSERT(!guest);
            play_dead();
        }

        /* Are we here for running vcpu context tasklets, or for idling? */
        if ( !guest && unlikely(tasklet_work_to_do(cpu)) )
        {
            do_tasklet();
            /* Livepatch work is always kicked off via a tasklet. */
            check_for_livepatch_work();
        }
        /*
         * Test softirqs twice --- first to see if we should even try
         * scrubbing, and then, after scrubbing is done, whether softirqs
         * became pending while we were scrubbing.
         */
        else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
                  !softirq_pending(cpu) )
        {
            if ( guest )
                sched_guest_idle(pm_idle, cpu);
            else
                pm_idle();
        }
        do_softirq();
    }
}

void startup_cpu_idle_loop(void)
{
    struct vcpu *v = current;

    ASSERT(is_idle_vcpu(v));
    cpumask_set_cpu(v->processor, v->domain->dirty_cpumask);
    write_atomic(&v->dirty_cpu, v->processor);

    reset_stack_and_jump(idle_loop);
}

void init_hypercall_page(struct domain *d, void *ptr)
{
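    /* Fill with 0xcc (INT3) so a branch into an unpopulated slot traps. */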
    memset(ptr, 0xcc, PAGE_SIZE);

    if ( is_hvm_domain(d) )
        hvm_init_hypercall_page(d, ptr);
    else if ( is_pv_64bit_domain(d) )
        pv_ring3_init_hypercall_page(ptr);
    else if ( is_pv_32bit_domain(d) )
        pv_ring1_init_hypercall_page(ptr);
    else
        ASSERT_UNREACHABLE();
}

void dump_pageframe_info(struct domain *d)
{
    struct page_info *page;

    printk("Memory pages belonging to domain %u:\n", d->domain_id);

    if ( domain_tot_pages(d) >= 10 && d->is_dying < DOMDYING_dead )
    {
        printk("    DomPage list too long to display\n");
    }
    else
    {
        unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {};

        spin_lock(&d->page_alloc_lock);
        page_list_for_each ( page, &d->page_list )
        {
            unsigned int index = MASK_EXTR(page->u.inuse.type_info,
                                           PGT_type_mask);

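            /*
             * Beyond the first 16 pages of a given type, stop printing the
             * common untyped/writable pages; other (more interesting) types
             * continue to be printed in full.
             */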
            if ( ++total[index] > 16 )
            {
                switch ( page->u.inuse.type_info & PGT_type_mask )
                {
                case PGT_none:
                case PGT_writable_page:
                    continue;
                }
            }
            printk("    DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
                   _p(mfn_x(page_to_mfn(page))),
                   page->count_info, page->u.inuse.type_info);
        }
        spin_unlock(&d->page_alloc_lock);
    }

    if ( is_hvm_domain(d) )
        p2m_pod_dump_data(d);

    spin_lock(&d->page_alloc_lock);

    page_list_for_each ( page, &d->xenpage_list )
    {
        printk("    XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
               _p(mfn_x(page_to_mfn(page))),
               page->count_info, page->u.inuse.type_info);
    }

    page_list_for_each ( page, &d->extra_page_list )
    {
        printk("    ExtraPage %p: caf=%08lx, taf=%" PRtype_info "\n",
               _p(mfn_x(page_to_mfn(page))),
               page->count_info, page->u.inuse.type_info);
    }

    spin_unlock(&d->page_alloc_lock);
}

void update_guest_memory_policy(struct vcpu *v,
                                struct guest_memory_policy *policy)
{
    bool old_guest_mode = nestedhvm_is_n2(v);
    bool new_guest_mode = policy->nested_guest_mode;

    /*
     * When 'v' is in the nested guest mode, all guest copy
     * functions/macros which finally call paging_gva_to_gfn()
     * transfer data to/from L2 guest. If the copy is intended for L1
     * guest, we must first clear the nested guest flag (by setting
     * policy->nested_guest_mode to false) before the copy and then
     * restore the nested guest flag (by setting
     * policy->nested_guest_mode to true) after the copy.
     */
    if ( unlikely(old_guest_mode != new_guest_mode) )
    {
        if ( new_guest_mode )
            nestedhvm_vcpu_enter_guestmode(v);
        else
            nestedhvm_vcpu_exit_guestmode(v);
        policy->nested_guest_mode = old_guest_mode;
    }
}

#ifndef CONFIG_BIGMEM
/*
 * The hole may be at or above the 44-bit boundary, so we need to determine
 * the total bit count until reaching 32 significant (not squashed out) bits
 * in PFN representations.
 * Note that the way "bits" gets initialized/updated/bounds-checked guarantees
 * that the function will never return zero, and hence will never be called
 * more than once (which is important due to it being deliberately placed in
 * .init.text).
 */
static unsigned int __init noinline _domain_struct_bits(void)
{
    unsigned int bits = 32 + PAGE_SHIFT;
    unsigned int sig = hweight32(~pfn_hole_mask);
    unsigned int mask = pfn_hole_mask >> 32;

    for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
        if ( !(mask & 1) )
            ++sig;

    return bits;
}
#endif

struct domain *alloc_domain_struct(void)
{
    struct domain *d;
#ifdef CONFIG_BIGMEM
    const unsigned int bits = 0;
#else
    /*
     * We pack the PDX of the domain structure into a 32-bit field within
     * the page_info structure. Hence the MEMF_bits() restriction.
     */
    static unsigned int __read_mostly bits;

    if ( unlikely(!bits) )
        bits = _domain_struct_bits();
#endif

    BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
    d = alloc_xenheap_pages(0, MEMF_bits(bits));
    if ( d != NULL )
        clear_page(d);
    return d;
}

void free_domain_struct(struct domain *d)
{
    free_xenheap_page(d);
}

struct vcpu *alloc_vcpu_struct(const struct domain *d)
{
    struct vcpu *v;
    /*
     * This structure contains embedded PAE PDPTEs, used when an HVM guest
     * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
     * may require that the shadow CR3 points below 4GB, and hence the whole
     * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
     */
    unsigned int memflags =
        (is_hvm_domain(d) && paging_mode_shadow(d)) ? MEMF_bits(32) : 0;

    BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE);
    v = alloc_xenheap_pages(0, memflags);
    if ( v != NULL )
        clear_page(v);
    return v;
}

void free_vcpu_struct(struct vcpu *v)
{
    free_xenheap_page(v);
}

/* Initialise various registers to their architectural INIT/RESET state. */
void arch_vcpu_regs_init(struct vcpu *v)
{
    memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs));
    v->arch.user_regs.eflags = X86_EFLAGS_MBS;

    memset(v->arch.dr, 0, sizeof(v->arch.dr));
    v->arch.dr6 = X86_DR6_DEFAULT;
    v->arch.dr7 = X86_DR7_DEFAULT;
}

int arch_vcpu_create(struct vcpu *v)
{
    struct domain *d = v->domain;
    int rc;

    v->arch.flags = TF_kernel_mode;

    rc = mapcache_vcpu_init(v);
    if ( rc )
        return rc;

    if ( !is_idle_domain(d) )
    {
        paging_vcpu_init(v);

        if ( (rc = vcpu_init_fpu(v)) != 0 )
            return rc;

        vmce_init_vcpu(v);

        arch_vcpu_regs_init(v);

        if ( (rc = init_vcpu_msr_policy(v)) )
            goto fail;
    }
    else if ( (rc = xstate_alloc_save_area(v)) != 0 )
        return rc;

    spin_lock_init(&v->arch.vpmu.vpmu_lock);

    if ( is_hvm_domain(d) )
        rc = hvm_vcpu_initialise(v);
    else if ( !is_idle_domain(d) )
        rc = pv_vcpu_initialise(v);
    else
    {
        /* Idle domain */
        v->arch.cr3 = __pa(idle_pg_table);
        rc = 0;
        v->arch.msrs = ZERO_BLOCK_PTR; /* Catch stray misuses */
    }

    if ( rc )
        goto fail;

    if ( !is_idle_domain(v->domain) )
    {
        vpmu_initialise(v);

        cpuid_policy_updated(v);
    }

    return rc;

 fail:
    vcpu_destroy_fpu(v);
    xfree(v->arch.msrs);
    v->arch.msrs = NULL;

    return rc;
}

void arch_vcpu_destroy(struct vcpu *v)
{
    xfree(v->arch.vm_event);
    v->arch.vm_event = NULL;

    vcpu_destroy_fpu(v);

    xfree(v->arch.msrs);
    v->arch.msrs = NULL;

    if ( is_hvm_vcpu(v) )
        hvm_vcpu_destroy(v);
    else
        pv_vcpu_destroy(v);
}

int arch_sanitise_domain_config(struct xen_domctl_createdomain *config)
{
    bool hvm = config->flags & XEN_DOMCTL_CDF_hvm;
    unsigned int max_vcpus;

    if ( hvm ? !hvm_enabled : !IS_ENABLED(CONFIG_PV) )
    {
        dprintk(XENLOG_INFO, "%s support not available\n", hvm ? "HVM" : "PV");
        return -EINVAL;
    }

    max_vcpus = hvm ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;

    if ( config->max_vcpus > max_vcpus )
    {
        dprintk(XENLOG_INFO, "Requested vCPUs (%u) exceeds max (%u)\n",
                config->max_vcpus, max_vcpus);
        return -EINVAL;
    }

    if ( !IS_ENABLED(CONFIG_TBOOT) &&
         (config->flags & XEN_DOMCTL_CDF_s3_integrity) )
    {
        dprintk(XENLOG_INFO, "S3 integrity check not valid without CONFIG_TBOOT\n");
        return -EINVAL;
    }

    if ( (config->flags & XEN_DOMCTL_CDF_hap) && !hvm_hap_supported() )
    {
        dprintk(XENLOG_INFO, "HAP requested but not supported\n");
        return -EINVAL;
    }

    if ( !(config->flags & XEN_DOMCTL_CDF_hvm) )
        /*
         * It is only meaningful for XEN_DOMCTL_CDF_oos_off to be clear
         * for HVM guests.
         */
        config->flags |= XEN_DOMCTL_CDF_oos_off;

    return 0;
}

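/*
 * Check that the set of requested emulated devices is one of the
 * combinations supported for this type of domain.
 */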
static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
{
#ifdef CONFIG_HVM
    /* This doesn't catch !CONFIG_HVM case but it is better than nothing */
    BUILD_BUG_ON(X86_EMU_ALL != XEN_X86_EMU_ALL);
#endif

    if ( is_hvm_domain(d) )
    {
        if ( is_hardware_domain(d) &&
             emflags != (X86_EMU_VPCI | X86_EMU_LAPIC | X86_EMU_IOAPIC) )
            return false;
        if ( !is_hardware_domain(d) &&
             emflags != (X86_EMU_ALL & ~X86_EMU_VPCI) &&
             emflags != X86_EMU_LAPIC )
            return false;
    }
    else if ( emflags != 0 && emflags != X86_EMU_PIT )
    {
        /* PV or classic PVH. */
        return false;
    }

    return true;
}

int arch_domain_create(struct domain *d,
                       struct xen_domctl_createdomain *config)
{
    bool paging_initialised = false;
    uint32_t emflags;
    int rc;

    INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);

    spin_lock_init(&d->arch.e820_lock);

    /* Minimal initialisation for the idle domain. */
    if ( unlikely(is_idle_domain(d)) )
    {
        static const struct arch_csw idle_csw = {
            .from = paravirt_ctxt_switch_from,
            .to   = paravirt_ctxt_switch_to,
            .tail = idle_loop,
        };

        d->arch.ctxt_switch = &idle_csw;

        d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */
        d->arch.msr = ZERO_BLOCK_PTR;

        return 0;
    }

    if ( !config )
    {
        /* Only IDLE is allowed with no config. */
        ASSERT_UNREACHABLE();
        return -EINVAL;
    }

    if ( d->domain_id && cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) )
    {
        if ( !opt_allow_unsafe )
        {
            printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU"
                   " for security reasons.\n");
            return -EPERM;
        }
        printk(XENLOG_G_WARNING
               "Dom%d may compromise security on this CPU.\n",
               d->domain_id);
    }

    emflags = config->arch.emulation_flags;

    if ( is_hardware_domain(d) && is_pv_domain(d) )
        emflags |= XEN_X86_EMU_PIT;

    if ( emflags & ~XEN_X86_EMU_ALL )
    {
        printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n",
               d->domain_id, emflags);
        return -EINVAL;
    }

    if ( !emulation_flags_ok(d, emflags) )
    {
        printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation "
               "with the current selection of emulators: %#x\n",
               d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags);
        return -EOPNOTSUPP;
    }
    d->arch.emulation_flags = emflags;

    HYPERVISOR_COMPAT_VIRT_START(d) =
        is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;

    if ( (rc = paging_domain_init(d)) != 0 )
        goto fail;
    paging_initialised = true;

    if ( (rc = init_domain_cpuid_policy(d)) )
        goto fail;

    if ( (rc = init_domain_msr_policy(d)) )
        goto fail;

    d->arch.ioport_caps =
        rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
    rc = -ENOMEM;
    if ( d->arch.ioport_caps == NULL )
        goto fail;

    /*
     * The shared_info machine address must fit in a 32-bit field within a
     * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
     */
    if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
        goto fail;

    clear_page(d->shared_info);
    share_xen_page_with_guest(virt_to_page(d->shared_info), d, SHARE_rw);

    if ( (rc = init_domain_irq_mapping(d)) != 0 )
        goto fail;

    if ( (rc = iommu_domain_init(d, config->iommu_opts)) != 0 )
        goto fail;

    psr_domain_init(d);

    if ( is_hvm_domain(d) )
    {
        if ( (rc = hvm_domain_initialise(d)) != 0 )
            goto fail;
    }
    else if ( is_pv_domain(d) )
    {
        mapcache_domain_init(d);

        if ( (rc = pv_domain_initialise(d)) != 0 )
            goto fail;
    }
    else
        ASSERT_UNREACHABLE(); /* Not HVM and not PV? */

    if ( (rc = tsc_set_info(d, TSC_MODE_DEFAULT, 0, 0, 0)) != 0 )
    {
        ASSERT_UNREACHABLE();
        goto fail;
    }

    /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
    pit_init(d, cpu_khz);

    /*
     * If the FPU does not save FCS/FDS then we can always
     * save/restore the 64-bit FIP/FDP and ignore the selectors.
     */
    d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;

    domain_cpu_policy_changed(d);

    return 0;

 fail:
    d->is_dying = DOMDYING_dead;
    psr_domain_free(d);
    iommu_domain_destroy(d);
    cleanup_domain_irq_mapping(d);
    free_xenheap_page(d->shared_info);
    xfree(d->arch.cpuid);
    xfree(d->arch.msr);
    if ( paging_initialised )
        paging_final_teardown(d);
    free_perdomain_mappings(d);

    return rc;
}

void arch_domain_destroy(struct domain *d)
{
    if ( is_hvm_domain(d) )
        hvm_domain_destroy(d);

    xfree(d->arch.e820);
    xfree(d->arch.cpuid);
    xfree(d->arch.msr);

    free_domain_pirqs(d);
    if ( !is_idle_domain(d) )
        iommu_domain_destroy(d);

    paging_final_teardown(d);

    if ( is_pv_domain(d) )
        pv_domain_destroy(d);
    free_perdomain_mappings(d);

    free_xenheap_page(d->shared_info);
    cleanup_domain_irq_mapping(d);

    psr_domain_free(d);
}

void arch_domain_shutdown(struct domain *d)
{
    if ( is_viridian_domain(d) )
        viridian_time_domain_freeze(d);
}

void arch_domain_pause(struct domain *d)
{
    if ( is_viridian_domain(d) )
        viridian_time_domain_freeze(d);
}

void arch_domain_unpause(struct domain *d)
{
    if ( is_viridian_domain(d) )
        viridian_time_domain_thaw(d);
}

int arch_domain_soft_reset(struct domain *d)
{
    struct page_info *page = virt_to_page(d->shared_info), *new_page;
    int ret = 0;
    struct domain *owner;
    mfn_t mfn;
    gfn_t gfn;
    p2m_type_t p2mt;
    unsigned int i;

    /* Soft reset is supported for HVM domains only. */
    if ( !is_hvm_domain(d) )
        return -EINVAL;

    spin_lock(&d->event_lock);
    for ( i = 0; i < d->nr_pirqs ; i++ )
    {
        if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
        {
            ret = unmap_domain_pirq_emuirq(d, i);
            if ( ret )
                break;
        }
    }
    spin_unlock(&d->event_lock);

    if ( ret )
        return ret;

    /*
     * The shared_info page needs to be replaced with a new page, otherwise we
     * will get a hole if the domain does XENMAPSPACE_shared_info.
     */

    owner = page_get_owner_and_reference(page);
    ASSERT( owner == d );

    mfn = page_to_mfn(page);
    gfn = mfn_to_gfn(d, mfn);

    /*
     * gfn == INVALID_GFN indicates that the shared_info page was never mapped
     * to the domain's address space and there is nothing to replace.
     */
    if ( gfn_eq(gfn, INVALID_GFN) )
        goto exit_put_page;

    if ( !mfn_eq(get_gfn_query(d, gfn_x(gfn), &p2mt), mfn) )
    {
        printk(XENLOG_G_ERR
               "Failed to get %pd's shared_info GFN (%"PRI_gfn")\n",
               d, gfn_x(gfn));
        ret = -EINVAL;
        goto exit_put_gfn;
    }

    new_page = alloc_domheap_page(d, 0);
    if ( !new_page )
    {
        printk(XENLOG_G_ERR
               "Failed to alloc a page to replace %pd's shared_info GFN %"PRI_gfn"\n",
               d, gfn_x(gfn));
        ret = -ENOMEM;
        goto exit_put_gfn;
    }

    ret = guest_physmap_remove_page(d, gfn, mfn, PAGE_ORDER_4K);
    if ( ret )
    {
        printk(XENLOG_G_ERR
               "Failed to remove %pd's shared_info GFN %"PRI_gfn"\n",
               d, gfn_x(gfn));
        free_domheap_page(new_page);
        goto exit_put_gfn;
    }

    ret = guest_physmap_add_page(d, gfn, page_to_mfn(new_page),
                                 PAGE_ORDER_4K);
    if ( ret )
    {
        printk(XENLOG_G_ERR
               "Failed to add a page to replace %pd's shared_info frame %"PRI_gfn"\n",
               d, gfn_x(gfn));
        free_domheap_page(new_page);
    }
 exit_put_gfn:
    put_gfn(d, gfn_x(gfn));
 exit_put_page:
    put_page(page);

    return ret;
}

void arch_domain_creation_finished(struct domain *d)
{
}

#define xen_vcpu_guest_context vcpu_guest_context
#define fpu_ctxt fpu_ctxt.x
CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
#undef fpu_ctxt
#undef xen_vcpu_guest_context

/* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */
int arch_set_info_guest(
    struct vcpu *v, vcpu_guest_context_u c)
{
    struct domain *d = v->domain;
    unsigned int i;
    unsigned long flags;
    bool compat;
#ifdef CONFIG_PV
    mfn_t cr3_mfn;
    struct page_info *cr3_page = NULL;
    unsigned int nr_gdt_frames;
    int rc = 0;
#endif

    /* The context is a compat-mode one if the target domain is compat-mode;
     * we expect the tools to DTRT even in compat-mode callers. */
    compat = is_pv_32bit_domain(d);

#define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
    flags = c(flags);

    if ( is_pv_domain(d) )
    {
        if ( !compat )
        {
            if ( !is_canonical_address(c.nat->user_regs.rip) ||
                 !is_canonical_address(c.nat->user_regs.rsp) ||
                 !is_canonical_address(c.nat->kernel_sp) ||
                 (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) ||
                 !is_canonical_address(c.nat->fs_base) ||
                 !is_canonical_address(c.nat->gs_base_kernel) ||
                 !is_canonical_address(c.nat->gs_base_user) ||
                 !is_canonical_address(c.nat->event_callback_eip) ||
                 !is_canonical_address(c.nat->syscall_callback_eip) ||
                 !is_canonical_address(c.nat->failsafe_callback_eip) )
                return -EINVAL;

            fixup_guest_stack_selector(d, c.nat->user_regs.ss);
            fixup_guest_stack_selector(d, c.nat->kernel_ss);
            fixup_guest_code_selector(d, c.nat->user_regs.cs);

            for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ )
            {
                if ( !is_canonical_address(c.nat->trap_ctxt[i].address) )
                    return -EINVAL;
                fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
            }

            if ( !__addr_ok(c.nat->ldt_base) )
                return -EINVAL;
        }
        else
        {
            fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
            fixup_guest_stack_selector(d, c.cmp->kernel_ss);
            fixup_guest_code_selector(d, c.cmp->user_regs.cs);
            fixup_guest_code_selector(d, c.cmp->event_callback_cs);
            fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);

            for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
                fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
        }

        /* LDT safety checks. */
        if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
             (c(ldt_ents) > 8192) )
            return -EINVAL;

        v->arch.pv.vgc_flags = flags;
    }

    v->arch.flags |= TF_kernel_mode;
    if ( unlikely(!(flags & VGCF_in_kernel)) &&
         /*
          * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
          * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
          * shadow_one_bit_disable() for why that is.
          */
         !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
        v->arch.flags &= ~TF_kernel_mode;

    vcpu_setup_fpu(v, v->arch.xsave_area,
                   flags & VGCF_I387_VALID ? &c.nat->fpu_ctxt : NULL,
                   FCW_DEFAULT);

    if ( !compat )
    {
        memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
        if ( is_pv_domain(d) )
            memcpy(v->arch.pv.trap_ctxt, c.nat->trap_ctxt,
                   sizeof(c.nat->trap_ctxt));
    }
    else
    {
        XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
        if ( is_pv_domain(d) )
        {
            for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
                XLAT_trap_info(v->arch.pv.trap_ctxt + i,
                               c.cmp->trap_ctxt + i);
        }
    }

    if ( v->vcpu_id == 0 && (c(vm_assist) & ~arch_vm_assist_valid_mask(d)) )
        return -EINVAL;

    if ( is_hvm_domain(d) )
    {
        for ( i = 0; i < ARRAY_SIZE(v->arch.dr); ++i )
            v->arch.dr[i] = c(debugreg[i]);
        v->arch.dr6 = c(debugreg[6]);
        v->arch.dr7 = c(debugreg[7]);

        if ( v->vcpu_id == 0 )
            d->vm_assist = c.nat->vm_assist;

        hvm_set_info_guest(v);
        goto out;
    }

#ifdef CONFIG_PV
    /* IOPL privileges are virtualised. */
    v->arch.pv.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
    v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;

    /* Ensure real hardware interrupts are enabled. */
    v->arch.user_regs.eflags |= X86_EFLAGS_IF;

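    /* A GDT frame holds 512 eight-byte descriptors, hence the divisor. */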
    nr_gdt_frames = DIV_ROUND_UP(c(gdt_ents), 512);
    if ( nr_gdt_frames > ARRAY_SIZE(v->arch.pv.gdt_frames) )
        return -EINVAL;

    if ( !v->is_initialised )
    {
        if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
            return -EINVAL;

        v->arch.pv.ldt_ents = c(ldt_ents);
        v->arch.pv.ldt_base = v->arch.pv.ldt_ents
                              ? c(ldt_base)
                              : (unsigned long)ZERO_BLOCK_PTR;
    }
    else
    {
        unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
        bool fail;

        if ( !compat )
        {
            fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
            if ( pagetable_is_null(v->arch.guest_table_user) )
                fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
            else
            {
                pfn = pagetable_get_pfn(v->arch.guest_table_user);
                fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
            }
        }
        else
        {
            l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));

            pfn = l4e_get_pfn(*l4tab);
            unmap_domain_page(l4tab);
            fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
        }

        fail |= v->arch.pv.gdt_ents != c(gdt_ents);
        for ( i = 0; !fail && i < nr_gdt_frames; ++i )
            fail = v->arch.pv.gdt_frames[i] != c(gdt_frames[i]);

        fail |= v->arch.pv.ldt_ents != c(ldt_ents);
        if ( v->arch.pv.ldt_ents )
            fail |= v->arch.pv.ldt_base != c(ldt_base);

        if ( fail )
            return -EOPNOTSUPP;
    }

    v->arch.pv.kernel_ss = c(kernel_ss);
    v->arch.pv.kernel_sp = c(kernel_sp);
    for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ctrlreg); ++i )
        v->arch.pv.ctrlreg[i] = c(ctrlreg[i]);

    v->arch.pv.event_callback_eip = c(event_callback_eip);
    v->arch.pv.failsafe_callback_eip = c(failsafe_callback_eip);
    if ( !compat )
    {
        v->arch.pv.syscall_callback_eip = c.nat->syscall_callback_eip;
        v->arch.pv.fs_base = c.nat->fs_base;
        v->arch.pv.gs_base_kernel = c.nat->gs_base_kernel;
        v->arch.pv.gs_base_user = c.nat->gs_base_user;
    }
    else
    {
        v->arch.pv.event_callback_cs = c(event_callback_cs);
        v->arch.pv.failsafe_callback_cs = c(failsafe_callback_cs);
    }

    /* Only CR0.TS is modifiable by guest or admin. */
    v->arch.pv.ctrlreg[0] &= X86_CR0_TS;
    v->arch.pv.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;

    v->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(v, v->arch.pv.ctrlreg[4]);

    memset(v->arch.dr, 0, sizeof(v->arch.dr));
    v->arch.dr6 = X86_DR6_DEFAULT;
    v->arch.dr7 = X86_DR7_DEFAULT;
    v->arch.pv.dr7_emul = 0;

    for ( i = 0; i < ARRAY_SIZE(v->arch.dr); i++ )
        set_debugreg(v, i, c(debugreg[i]));
    set_debugreg(v, 6, c(debugreg[6]));
    set_debugreg(v, 7, c(debugreg[7]));

    if ( v->is_initialised )
        goto out;

    if ( v->vcpu_id == 0 )
    {
        /*
         * In the restore case we need to deal with L4 pages which got
         * initialized with m2p_strict still clear (and which hence lack the
         * correct initial RO_MPT_VIRT_{START,END} L4 entry).
         */
        if ( d != current->domain && !VM_ASSIST(d, m2p_strict) &&
             is_pv_domain(d) && !is_pv_32bit_domain(d) &&
             test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) &&
             atomic_read(&d->arch.pv.nr_l4_pages) )
        {
            bool done = false;

            spin_lock_recursive(&d->page_alloc_lock);

            for ( i = 0; ; )
            {
                struct page_info *page = page_list_remove_head(&d->page_list);

                if ( page_lock(page) )
                {
                    if ( (page->u.inuse.type_info & PGT_type_mask) ==
                         PGT_l4_page_table )
                        done = !fill_ro_mpt(page_to_mfn(page));

                    page_unlock(page);
                }

                page_list_add_tail(page, &d->page_list);

                if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
                    break;
            }

            spin_unlock_recursive(&d->page_alloc_lock);

            if ( !done )
                return -ERESTART;
        }

        d->vm_assist = c(vm_assist);
    }

    rc = put_old_guest_table(current);
    if ( rc )
        return rc;

    if ( !compat )
        rc = pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
    else
    {
        unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv.gdt_frames)];

        for ( i = 0; i < nr_gdt_frames; ++i )
            gdt_frames[i] = c.cmp->gdt_frames[i];

        rc = pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents);
    }
    if ( rc != 0 )
        return rc;

    set_bit(_VPF_in_reset, &v->pause_flags);

    if ( !compat )
        cr3_mfn = _mfn(xen_cr3_to_pfn(c.nat->ctrlreg[3]));
    else
        cr3_mfn = _mfn(compat_cr3_to_pfn(c.cmp->ctrlreg[3]));
    cr3_page = get_page_from_mfn(cr3_mfn, d);

    if ( !cr3_page )
        rc = -EINVAL;
    else if ( paging_mode_refcounts(d) )
        /* nothing */;
    else if ( cr3_page == v->arch.old_guest_table )
    {
        v->arch.old_guest_table = NULL;
        put_page(cr3_page);
    }
    else
    {
        if ( !compat )
            rc = put_old_guest_table(v);
        if ( !rc )
            rc = get_page_type_preemptible(cr3_page,
                                           !compat ? PGT_root_page_table
                                                   : PGT_l3_page_table);
        switch ( rc )
        {
        case -EINTR:
            rc = -ERESTART;
        case -ERESTART:
            break;
        case 0:
            if ( !compat && !VM_ASSIST(d, m2p_strict) &&
                 !paging_mode_refcounts(d) )
                fill_ro_mpt(cr3_mfn);
            break;
        default:
            if ( cr3_page == current->arch.old_guest_table )
                cr3_page = NULL;
            break;
        }
    }
    if ( rc )
        /* handled below */;
    else if ( !compat )
    {
        v->arch.guest_table = pagetable_from_page(cr3_page);
        if ( c.nat->ctrlreg[1] )
        {
            cr3_mfn = _mfn(xen_cr3_to_pfn(c.nat->ctrlreg[1]));
            cr3_page = get_page_from_mfn(cr3_mfn, d);

            if ( !cr3_page )
                rc = -EINVAL;
            else if ( !paging_mode_refcounts(d) )
            {
                rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
                switch ( rc )
                {
                case -EINTR:
                    rc = -ERESTART;
                    /* Fallthrough */
                case -ERESTART:
                    /*
                     * NB that we're putting the kernel-mode table
                     * here, which we've already successfully
                     * validated above; hence partial = false;
                     */
                    v->arch.old_guest_ptpg = NULL;
                    v->arch.old_guest_table =
                        pagetable_get_page(v->arch.guest_table);
                    v->arch.old_guest_table_partial = false;
                    v->arch.guest_table = pagetable_null();
                    break;
                default:
                    if ( cr3_page == current->arch.old_guest_table )
                        cr3_page = NULL;
                    break;
                case 0:
                    if ( VM_ASSIST(d, m2p_strict) )
                        zap_ro_mpt(cr3_mfn);
                    break;
                }
            }
            if ( !rc )
                v->arch.guest_table_user = pagetable_from_page(cr3_page);
        }
    }
    else
    {
        l4_pgentry_t *l4tab;

        l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
        *l4tab = l4e_from_mfn(page_to_mfn(cr3_page),
            _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
        unmap_domain_page(l4tab);
    }
    if ( rc )
    {
        if ( cr3_page )
            put_page(cr3_page);
        pv_destroy_gdt(v);
        return rc;
    }

    clear_bit(_VPF_in_reset, &v->pause_flags);

    if ( v->vcpu_id == 0 )
        update_domain_wallclock_time(d);

    /* Don't redo final setup */
    v->is_initialised = 1;

    if ( paging_mode_enabled(d) )
        paging_update_paging_modes(v);

    update_cr3(v);
#endif /* CONFIG_PV */

 out:
    if ( flags & VGCF_online )
        clear_bit(_VPF_down, &v->pause_flags);
    else
        set_bit(_VPF_down, &v->pause_flags);
    return 0;
#undef c
}

int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
{
    int rc;

    if ( is_hvm_vcpu(v) )
    {
        struct domain *d = v->domain;
        struct vcpu_hvm_context ctxt;

        if ( copy_from_guest(&ctxt, arg, 1) )
            return -EFAULT;

        domain_lock(d);
        rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt);
        domain_unlock(d);
    }
    else
        rc = default_initialise_vcpu(v, arg);

    return rc;
}

int arch_vcpu_reset(struct vcpu *v)
{
    v->arch.async_exception_mask = 0;
    memset(v->arch.async_exception_state, 0,
           sizeof(v->arch.async_exception_state));

    if ( is_pv_vcpu(v) )
    {
        pv_destroy_gdt(v);
        return vcpu_destroy_pagetables(v);
    }

    vcpu_end_shutdown_deferral(v);
    return 0;
}

long
arch_do_vcpu_op(
    int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
{
    long rc = 0;

    switch ( cmd )
    {
    case VCPUOP_send_nmi:
        if ( !guest_handle_is_null(arg) )
            return -EINVAL;

        if ( !test_and_set_bool(v->arch.nmi_pending) )
            vcpu_kick(v);
        break;

    case VCPUOP_register_vcpu_time_memory_area:
    {
        struct vcpu_register_time_memory_area area;

        rc = -EFAULT;
        if ( copy_from_guest(&area, arg, 1) )
            break;

        if ( !guest_handle_okay(area.addr.h, 1) )
            break;

        rc = 0;
        v->arch.time_info_guest = area.addr.h;

        force_update_vcpu_system_time(v);

        break;
    }

    case VCPUOP_get_physid:
    {
        struct vcpu_get_physid cpu_id;

        rc = -EINVAL;
        if ( !is_hwdom_pinned_vcpu(v) )
            break;

        cpu_id.phys_id =
            (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
            ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);

        rc = -EFAULT;
        if ( copy_to_guest(arg, &cpu_id, 1) )
            break;

        rc = 0;
        break;
    }

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

/*
 * Notes on PV segment handling:
 *  - 32bit: All data from the GDT/LDT.
 *  - 64bit: In addition, 64bit FS/GS/GS_KERN bases.
 *
 * Linux's ABI with userspace expects to preserve the full selector and
 * segment base, even sel != NUL, base != GDT/LDT, for 64bit code.  Xen must
 * honour this when context switching, to avoid breaking Linux's ABI.
 *
 * Note: It is impossible to preserve a selector value of 1, 2 or 3, as these
 *       get reset to 0 by an IRET back to guest context.  Code playing with
 *       arcane corners of x86 gets to keep all resulting pieces.
 *
 * Therefore, we:
 *  - Load the LDT.
 *  - Load each segment selector.
 *    - Any error loads zero, and triggers a failsafe callback.
 *  - For 64bit, further load the 64bit bases.
 *
 * An optimisation exists on SVM-capable hardware, where we use a VMLOAD
 * instruction to load the LDT and full FS/GS/GS_KERN data in one go.
 *
 * AMD-like CPUs prior to Zen2 do not zero the segment base or limit when
 * loading a NUL selector.  This is a problem in principle when context
 * switching to a 64bit guest, as a NUL FS/GS segment is usable and will pick
 * up the stale base.
 *
 * However, it is not an issue in practice.  NUL segments are unusable for
 * 32bit guests (so any stale base won't be used), and we unconditionally
 * write the full FS/GS bases for 64bit guests.
 */
static void load_segments(struct vcpu *n)
{
    struct cpu_user_regs *uregs = &n->arch.user_regs;
    bool compat = is_pv_32bit_vcpu(n);
    bool all_segs_okay = true, fs_gs_done = false;

    /*
     * Attempt to load @seg with selector @val.  On error, clear
     * @all_segs_okay in function scope, and load NUL into @seg.
     */
#define TRY_LOAD_SEG(seg, val)                          \
    asm volatile ( "1: mov %k[_val], %%" #seg "\n\t"    \
                   "2:\n\t"                             \
                   ".section .fixup, \"ax\"\n\t"        \
                   "3: xor %k[ok], %k[ok]\n\t"          \
                   "   mov %k[ok], %%" #seg "\n\t"      \
                   "   jmp 2b\n\t"                      \
                   ".previous\n\t"                      \
                   _ASM_EXTABLE(1b, 3b)                 \
                   : [ok] "+r" (all_segs_okay)          \
                   : [_val] "rm" (val) )

#ifdef CONFIG_HVM
    if ( cpu_has_svm && !compat && (uregs->fs | uregs->gs) <= 3 )
    {
        unsigned long gsb = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
        unsigned long gss = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;

        fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                                   n->arch.pv.fs_base, gsb, gss);
    }
#endif
    if ( !fs_gs_done )
    {
        load_LDT(n);

        TRY_LOAD_SEG(fs, uregs->fs);
        TRY_LOAD_SEG(gs, uregs->gs);
    }

    TRY_LOAD_SEG(ds, uregs->ds);
    TRY_LOAD_SEG(es, uregs->es);

    if ( !fs_gs_done && !compat )
    {
        wrfsbase(n->arch.pv.fs_base);
        wrgsshadow(n->arch.pv.gs_base_kernel);
        wrgsbase(n->arch.pv.gs_base_user);

        /* If in kernel mode then switch the GS bases around. */
        if ( (n->arch.flags & TF_kernel_mode) )
            asm volatile ( "swapgs" );
    }

    if ( unlikely(!all_segs_okay) )
    {
        struct pv_vcpu *pv = &n->arch.pv;
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (unsigned long *)(((n->arch.flags & TF_kernel_mode)
                               ? regs->rsp : pv->kernel_sp) & ~0xf);
        unsigned long cs_and_mask, rflags;

        /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */
        rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
        rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
        if ( VM_ASSIST(n->domain, architectural_iopl) )
            rflags |= n->arch.pv.iopl;

        if ( is_pv_32bit_vcpu(n) )
        {
            unsigned int *esp = ring_1(regs) ?
                                (unsigned int *)regs->rsp :
                                (unsigned int *)pv->kernel_sp;
            int ret = 0;

            /* CS longword also contains full evtchn_upcall_mask. */
            cs_and_mask = (unsigned short)regs->cs |
                ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);

            if ( !ring_1(regs) )
            {
                ret  = put_user(regs->ss,       esp-1);
                ret |= put_user(regs->esp,      esp-2);
                esp -= 2;
            }

            if ( ret |
                 put_user(rflags,              esp-1) |
                 put_user(cs_and_mask,         esp-2) |
                 put_user(regs->eip,           esp-3) |
                 put_user(uregs->gs,           esp-4) |
                 put_user(uregs->fs,           esp-5) |
                 put_user(uregs->es,           esp-6) |
                 put_user(uregs->ds,           esp-7) )
            {
                gprintk(XENLOG_ERR,
                        "error while creating compat failsafe callback frame\n");
                domain_crash(n->domain);
            }

            if ( n->arch.pv.vgc_flags & VGCF_failsafe_disables_events )
                vcpu_info(n, evtchn_upcall_mask) = 1;

            regs->entry_vector |= TRAP_syscall;
            regs->eflags       &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT|
                                    X86_EFLAGS_IOPL|X86_EFLAGS_TF);
            regs->ss            = FLAT_COMPAT_KERNEL_SS;
            regs->esp           = (unsigned long)(esp-7);
            regs->cs            = FLAT_COMPAT_KERNEL_CS;
            regs->eip           = pv->failsafe_callback_eip;
            return;
        }

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        /* CS longword also contains full evtchn_upcall_mask. */
        cs_and_mask = (unsigned long)regs->cs |
            ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(rflags,              rsp- 3) |
             put_user(cs_and_mask,         rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(uregs->gs,           rsp- 6) |
             put_user(uregs->fs,           rsp- 7) |
             put_user(uregs->es,           rsp- 8) |
             put_user(uregs->ds,           rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            gprintk(XENLOG_ERR,
                    "error while creating failsafe callback frame\n");
            domain_crash(n->domain);
        }

        if ( n->arch.pv.vgc_flags & VGCF_failsafe_disables_events )
            vcpu_info(n, evtchn_upcall_mask) = 1;

        regs->entry_vector |= TRAP_syscall;
        regs->rflags       &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
                                X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF);
        regs->ss            = FLAT_KERNEL_SS;
        regs->rsp           = (unsigned long)(rsp-11);
        regs->cs            = FLAT_KERNEL_CS;
        regs->rip           = pv->failsafe_callback_eip;
    }
}

/*
 * Record all guest segment state.  The guest can load segment selectors
 * without trapping, which will also alter the 64bit FS/GS bases.  Arbitrary
 * changes to bases can also be made with the WR{FS,GS}BASE instructions, when
 * enabled.
 *
 * Guests however cannot use SWAPGS, so there is no mechanism to modify the
 * inactive GS base behind Xen's back.  Therefore, Xen's copy of the inactive
 * GS base is still accurate, and doesn't need reading back from hardware.
 */
static void save_segments(struct vcpu *v)
{
    struct cpu_user_regs *regs = &v->arch.user_regs;

    regs->ds = read_sreg(ds);
    regs->es = read_sreg(es);
    regs->fs = read_sreg(fs);
    regs->gs = read_sreg(gs);

    if ( !is_pv_32bit_vcpu(v) )
    {
        unsigned long gs_base = rdgsbase();

        v->arch.pv.fs_base = rdfsbase();
        if ( v->arch.flags & TF_kernel_mode )
            v->arch.pv.gs_base_kernel = gs_base;
        else
            v->arch.pv.gs_base_user = gs_base;
    }
}

void paravirt_ctxt_switch_from(struct vcpu *v)
{
    save_segments(v);

    /*
     * Disable debug breakpoints. We do this aggressively because if we switch
     * to an HVM guest we may load DR0-DR3 with values that can cause #DB
     * inside Xen, before we get a chance to reload DR7, and this cannot always
     * safely be handled.
     */
    if ( unlikely(v->arch.dr7 & DR7_ACTIVE_MASK) )
        write_debugreg(7, 0);
}

void paravirt_ctxt_switch_to(struct vcpu *v)
{
    root_pgentry_t *root_pgt = this_cpu(root_pgt);

    if ( root_pgt )
        root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
            l4e_from_page(v->domain->arch.perdomain_l3_pg,
                          __PAGE_HYPERVISOR_RW);

    if ( unlikely(v->arch.dr7 & DR7_ACTIVE_MASK) )
        activate_debugregs(v);

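    /* Load the incoming vcpu's TSC_AUX value, as observed via RDTSCP/RDPID. */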
    if ( cpu_has_msr_tsc_aux )
        wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
}

/* Update per-VCPU guest runstate shared memory area (if registered). */
bool update_runstate_area(struct vcpu *v)
{
    bool rc;
    struct guest_memory_policy policy = { .nested_guest_mode = false };
    void __user *guest_handle = NULL;
    struct vcpu_runstate_info runstate;

    if ( guest_handle_is_null(runstate_guest(v)) )
        return true;

    update_guest_memory_policy(v, &policy);

    memcpy(&runstate, &v->runstate, sizeof(runstate));

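    /*
     * If the guest requested it, bracket the copy with XEN_RUNSTATE_UPDATE in
     * state_entry_time (with write barriers), so the guest can detect a torn
     * read of the runstate data and retry.
     */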
    if ( VM_ASSIST(v->domain, runstate_update_flag) )
    {
        guest_handle = has_32bit_shinfo(v->domain)
            ? &v->runstate_guest.compat.p->state_entry_time + 1
            : &v->runstate_guest.native.p->state_entry_time + 1;
        guest_handle--;
        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
        __raw_copy_to_guest(guest_handle,
                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
        smp_wmb();
    }

    if ( has_32bit_shinfo(v->domain) )
    {
        struct compat_vcpu_runstate_info info;

        XLAT_vcpu_runstate_info(&info, &runstate);
        __copy_to_guest(v->runstate_guest.compat, &info, 1);
        rc = true;
    }
    else
        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
             sizeof(runstate);

    if ( guest_handle )
    {
        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
        smp_wmb();
        __raw_copy_to_guest(guest_handle,
                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
    }

    update_guest_memory_policy(v, &policy);

    return rc;
}

static void _update_runstate_area(struct vcpu *v)
{
    if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
         !(v->arch.flags & TF_kernel_mode) )
        v->arch.pv.need_update_runstate_area = 1;
}

/*
 * Overview of Xen's GDTs.
 *
 * Xen maintains per-CPU compat and regular GDTs which are both a single page
 * in size.  Some content is specific to each CPU (the TSS, the per-CPU marker
 * for #DF handling, and optionally the LDT).  The compat and regular GDTs
 * differ by the layout and content of the guest accessible selectors.
 *
 * The Xen selectors live from 0xe000 (slot 14 of 16), and need to always
 * appear in this position for interrupt/exception handling to work.
 *
 * A PV guest may specify GDT frames of their own (slots 0 to 13).  Room for a
 * full GDT exists in the per-domain mappings.
 *
 * To schedule a PV vcpu, we point slot 14 of the guest's full GDT at the
 * current CPU's compat or regular (as appropriate) GDT frame.  This is so
 * that the per-CPU parts still work correctly after switching pagetables and
 * loading the guests full GDT into GDTR.
 *
 * To schedule Idle or HVM vcpus, we load a GDT base address which causes the
 * regular per-CPU GDT frame to appear with selectors at the appropriate
 * offset.
 */
static always_inline bool need_full_gdt(const struct domain *d)
{
    return is_pv_domain(d) && !is_idle_domain(d);
}

update_xen_slot_in_full_gdt(const struct vcpu * v,unsigned int cpu)1659 static void update_xen_slot_in_full_gdt(const struct vcpu *v, unsigned int cpu)
1660 {
1661     l1e_write(pv_gdt_ptes(v) + FIRST_RESERVED_GDT_PAGE,
1662               !is_pv_32bit_vcpu(v) ? per_cpu(gdt_l1e, cpu)
1663                                    : per_cpu(compat_gdt_l1e, cpu));
1664 }
1665 
load_full_gdt(const struct vcpu * v,unsigned int cpu)1666 static void load_full_gdt(const struct vcpu *v, unsigned int cpu)
1667 {
1668     struct desc_ptr gdt_desc = {
1669         .limit = LAST_RESERVED_GDT_BYTE,
1670         .base = GDT_VIRT_START(v),
1671     };
1672 
1673     lgdt(&gdt_desc);
1674 
1675     per_cpu(full_gdt_loaded, cpu) = true;
1676 }
1677 
load_default_gdt(unsigned int cpu)1678 static void load_default_gdt(unsigned int cpu)
1679 {
1680     struct desc_ptr gdt_desc = {
1681         .limit = LAST_RESERVED_GDT_BYTE,
1682         .base  = (unsigned long)(per_cpu(gdt, cpu) - FIRST_RESERVED_GDT_ENTRY),
1683     };
1684 
1685     lgdt(&gdt_desc);
1686 
1687     per_cpu(full_gdt_loaded, cpu) = false;
1688 }
1689 
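/*
 * Perform the register-state part of a context switch: save the outgoing
 * vcpu's GPRs/FPU state, update the dirty-CPU tracking, restore the incoming
 * vcpu's state, and switch GDT and page tables.  Called with interrupts
 * disabled; per_cpu(curr_vcpu) is only updated once the switch is complete.
 */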
static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int          cpu = smp_processor_id();
    struct vcpu          *p = per_cpu(curr_vcpu, cpu);
    struct vcpu          *n = current;
    struct domain        *pd = p->domain, *nd = n->domain;

    ASSERT(p != n);
    ASSERT(!vcpu_cpu_dirty(n));

    if ( !is_idle_domain(pd) )
    {
        ASSERT(read_atomic(&p->dirty_cpu) == cpu);
        memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
        vcpu_save_fpu(p);
        pd->arch.ctxt_switch->from(p);
    }

    /*
     * Mark this CPU in next domain's dirty cpumasks before calling
     * ctxt_switch_to(). This avoids a race on things like EPT flushing,
     * which is synchronised on that function.
     */
    if ( pd != nd )
        cpumask_set_cpu(cpu, nd->dirty_cpumask);
    write_atomic(&n->dirty_cpu, cpu);

    if ( !is_idle_domain(nd) )
    {
        memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
        if ( cpu_has_xsave )
        {
            u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;

            if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
                BUG();

            if ( cpu_has_xsaves && is_hvm_vcpu(n) )
                set_msr_xss(n->arch.msrs->xss.raw);
        }
        vcpu_restore_fpu_nonlazy(n, false);
        nd->arch.ctxt_switch->to(n);
    }

    psr_ctxt_switch_to(nd);

    if ( need_full_gdt(nd) )
        update_xen_slot_in_full_gdt(n, cpu);

    if ( per_cpu(full_gdt_loaded, cpu) &&
         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
        load_default_gdt(cpu);

    write_ptbase(n);

#if defined(CONFIG_PV) && defined(CONFIG_HVM)
    /* Prefetch the VMCB if we expect to use it later in the context switch */
    if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
         !is_idle_domain(nd) )
        svm_load_segs(0, 0, 0, 0, 0);
#endif

    if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
        load_full_gdt(n, cpu);

    if ( pd != nd )
        cpumask_clear_cpu(cpu, pd->dirty_cpumask);
    write_atomic(&p->dirty_cpu, VCPU_CPU_CLEAN);

    per_cpu(curr_vcpu, cpu) = n;
}

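/*
 * Scheduler entry point for switching from 'prev' to 'next' on this CPU.
 * State belonging to 'next' that is still dirty on a remote CPU is flushed
 * first; the lazy-switch optimisation skips __context_switch() entirely when
 * switching to the idle vcpu or back to the still-loaded previous vcpu.
 */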
void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    const struct domain *prevd = prev->domain, *nextd = next->domain;
    unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);

    ASSERT(prev != next);
    ASSERT(local_irq_is_enabled());

    get_cpu_info()->use_pv_cr3 = false;
    get_cpu_info()->xen_cr3 = 0;

    if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN )
    {
        /* Remote CPU calls __sync_local_execstate() from flush IPI handler. */
        flush_mask(cpumask_of(dirty_cpu), FLUSH_VCPU_STATE);
        ASSERT(!vcpu_cpu_dirty(next));
    }

    _update_runstate_area(prev);
    vpmu_switch_from(prev);
    np2m_schedule(NP2M_SCHEDLE_OUT);

    if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm.tm_list) )
        pt_save_timer(prev);

    local_irq_disable();

    set_current(next);

    if ( (per_cpu(curr_vcpu, cpu) == next) ||
         (is_idle_domain(nextd) && cpu_online(cpu)) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( is_pv_domain(nextd) )
            load_segments(next);

        ctxt_switch_levelling(next);

        if ( opt_ibpb && !is_idle_domain(nextd) )
        {
            static DEFINE_PER_CPU(unsigned int, last);
            unsigned int *last_id = &this_cpu(last);

            /*
             * Squash the domid and vcpu id together for comparison
             * efficiency.  We could in principle stash and compare the struct
             * vcpu pointer, but this risks a false alias if a domain has died
             * and the same 4k page gets reused for a new vcpu.
             */
            unsigned int next_id = (((unsigned int)nextd->domain_id << 16) |
                                    (uint16_t)next->vcpu_id);
            BUILD_BUG_ON(MAX_VIRT_CPUS > 0xffff);

            /*
             * When scheduling from a vcpu, to idle, and back to the same vcpu
             * (which might be common in a lightly loaded system, or when
             * using vcpu pinning), there is no need to issue IBPB, as we are
             * returning to the same security context.
             */
            if ( *last_id != next_id )
            {
                wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
                *last_id = next_id;
            }
        }
    }

    sched_context_switched(prev, next);

    _update_runstate_area(next);
    /* Must be done with interrupts enabled */
    vpmu_switch_to(next);
    np2m_schedule(NP2M_SCHEDLE_IN);

    /* Ensure that the vcpu has an up-to-date time base. */
    update_vcpu_system_time(next);

    reset_stack_and_jump_ind(nextd->arch.ctxt_switch->tail);
}

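/* Resume execution of 'same' on this CPU without a full context switch. */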
void continue_running(struct vcpu *same)
{
    reset_stack_and_jump_ind(same->domain->arch.ctxt_switch->tail);
}

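/*
 * If this CPU is still lazily holding another vcpu's state (i.e. it switched
 * to the idle vcpu without a full context switch), commit that state now.
 * Returns nonzero if a switch was actually required.
 */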
int __sync_local_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_local_execstate(void)
{
    (void)__sync_local_execstate();
}

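/*
 * Ensure v's state is no longer held lazily by any CPU: sync locally if this
 * CPU holds it, or IPI the remote CPU that does.
 */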
void sync_vcpu_execstate(struct vcpu *v)
{
    unsigned int dirty_cpu = read_atomic(&v->dirty_cpu);

    if ( dirty_cpu == smp_processor_id() )
        sync_local_execstate();
    else if ( is_vcpu_dirty_cpu(dirty_cpu) )
    {
        /* Remote CPU calls __sync_local_execstate() from flush IPI handler. */
        flush_mask(cpumask_of(dirty_cpu), FLUSH_VCPU_STATE);
    }
    ASSERT(!is_vcpu_dirty_cpu(dirty_cpu) ||
           read_atomic(&v->dirty_cpu) != dirty_cpu);
}

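/*
 * Drop the references a dying domain holds on the pages in 'list', forcibly
 * devalidating page tables of the given 'type' to break circular references.
 * May return -ERESTART, in which case the caller re-invokes it via a
 * hypercall continuation.
 */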
static int relinquish_memory(
    struct domain *d, struct page_list_head *list, unsigned long type)
{
    struct page_info  *page;
    unsigned long     x, y;
    int               ret = 0;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    while ( (page = page_list_remove_head(list)) )
    {
        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            page_list_add_tail(page, &d->arch.relmem_list);
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            ret = put_page_and_type_preemptible(page);
        switch ( ret )
        {
        case 0:
            break;
        case -ERESTART:
        case -EINTR:
            /*
             * -EINTR means PGT_validated has been re-set; re-set
             * PGT_pinned again so that it gets picked up next time
             * around.
             *
             * -ERESTART, OTOH, means PGT_partial is set instead.  Put
             * it back on the list, but don't set PGT_pinned; the
             * section below will finish off de-validation.  But we do
             * need to drop the general ref associated with
             * PGT_pinned, since put_page_and_type_preemptible()
             * didn't do it.
             *
             * NB we can do an ASSERT for PGT_validated, since we
             * "own" the type ref; but theoretically, the PGT_partial
             * could be cleared by someone else.
             */
            if ( ret == -EINTR )
            {
                ASSERT(page->u.inuse.type_info & PGT_validated);
                set_bit(_PGT_pinned, &page->u.inuse.type_info);
            }
            else
                put_page(page);

            ret = -ERESTART;

            /* Put the page back on the list and drop the ref we grabbed above. */
            page_list_add(page, list);
            put_page(page);
            goto out;
        default:
            BUG();
        }

        put_page_alloc_ref(page);

        /*
         * Forcibly invalidate top-most, still valid page tables at this point
         * to break circular 'linear page table' references as well as clean up
         * partially validated pages. This is okay because MMU structures are
         * not shared across domains and this domain is now dead. Thus top-most
         * valid tables are not in use so a non-zero count means circular
         * reference or partially validated.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & PGT_type_mask) != type) ||
                 likely(!(x & (PGT_validated|PGT_partial))) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x,
                        x & ~(PGT_validated|PGT_partial));
            if ( likely(y == x) )
            {
                /* No need for atomic update of type_info here: no one else updates it. */
                switch ( ret = devalidate_page(page, x, 1) )
                {
                case 0:
                    break;
                case -EINTR:
                    page_list_add(page, list);
                    page->u.inuse.type_info |= PGT_validated;
                    if ( x & PGT_partial )
                        put_page(page);
                    put_page(page);
                    ret = -ERESTART;
                    goto out;
                case -ERESTART:
                    page_list_add(page, list);
                    /*
                     * PGT_partial holds a type ref and a general ref.
                     * If we came in with PGT_partial set, then we 1)
                     * don't need to grab an extra type count, and 2)
                     * do need to drop the extra page ref we grabbed
                     * at the top of the loop.  If we didn't come in
                     * with PGT_partial set, we 1) do need to grab an
                     * extra type count, but 2) can transfer the page
                     * ref we grabbed above to it.
                     *
                     * Note that we must increment type_info before
                     * setting PGT_partial.  Theoretically it should
                     * be safe to drop the page ref before setting
                     * PGT_partial, but do it afterwards just to be
                     * extra safe.
                     */
                    if ( !(x & PGT_partial) )
                        page->u.inuse.type_info++;
                    smp_wmb();
                    page->u.inuse.type_info |= PGT_partial;
                    if ( x & PGT_partial )
                        put_page(page);
                    goto out;
                default:
                    BUG();
                }
                if ( x & PGT_partial )
                {
                    page->u.inuse.type_info--;
                    put_page(page);
                }
                break;
            }
        }

        /* Put the page on the list and /then/ potentially free it. */
        page_list_add_tail(page, &d->arch.relmem_list);
        put_page(page);

        if ( hypercall_preempt_check() )
        {
            ret = -ERESTART;
            goto out;
        }
    }

    /* list is empty at this point. */
    page_list_move(list, &d->arch.relmem_list);

 out:
    spin_unlock_recursive(&d->page_alloc_lock);
    return ret;
}

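/*
 * Release all resources still referenced by a dying domain, in stages, using
 * hypercall continuations; d->arch.rel_priv records how far we have got.
 */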
int domain_relinquish_resources(struct domain *d)
{
    int ret;
    struct vcpu *v;

    BUG_ON(!cpumask_empty(d->dirty_cpumask));

    /*
     * This hypercall can take minutes of wallclock time to complete.  This
     * logic implements a co-routine, stashing state in struct domain across
     * hypercall continuation boundaries.
     */
    switch ( d->arch.rel_priv )
    {
        /*
         * Record the current progress.  Subsequent hypercall continuations
         * will logically restart work from this point.
         *
         * PROGRESS() markers must not be in the middle of loops.  The loop
         * variable isn't preserved across a continuation.
         *
         * To avoid redundant work, there should be a marker before each
         * function which may return -ERESTART.
         */
#define PROGRESS(x)                                                     \
        d->arch.rel_priv = PROG_ ## x; /* Fallthrough */ case PROG_ ## x
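        /*
         * As an illustration, "PROGRESS(paging):" below expands to
         *     d->arch.rel_priv = PROG_paging; case PROG_paging:
         * (fall-through annotation elided), i.e. each marker both records
         * the progress made and provides the case label through which a
         * later continuation re-enters the switch.
         */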

        enum {
            PROG_paging = 1,
            PROG_vcpu_pagetables,
            PROG_shared,
            PROG_xen,
            PROG_l4,
            PROG_l3,
            PROG_l2,
            PROG_done,
        };

    case 0:
        ret = pci_release_devices(d);
        if ( ret )
            return ret;

    PROGRESS(paging):

        /* Tear down paging-assistance stuff. */
        ret = paging_teardown(d);
        if ( ret )
            return ret;

    PROGRESS(vcpu_pagetables):

        /*
         * Drop the in-use references to page-table bases and clean
         * up vPMU instances.
         */
        for_each_vcpu ( d, v )
        {
            ret = vcpu_destroy_pagetables(v);
            if ( ret )
                return ret;

            vpmu_destroy(v);
        }

        if ( altp2m_active(d) )
        {
            for_each_vcpu ( d, v )
                altp2m_vcpu_disable_ve(v);
        }

        if ( is_pv_domain(d) )
        {
            for_each_vcpu ( d, v )
            {
                /* Relinquish GDT/LDT mappings. */
                pv_destroy_ldt(v);
                pv_destroy_gdt(v);
            }
        }

        if ( d->arch.pirq_eoi_map != NULL )
        {
            unmap_domain_page_global(d->arch.pirq_eoi_map);
            put_page_and_type(mfn_to_page(_mfn(d->arch.pirq_eoi_map_mfn)));
            d->arch.pirq_eoi_map = NULL;
            d->arch.auto_unmask = 0;
        }

#ifdef CONFIG_MEM_SHARING
    PROGRESS(shared):

        if ( is_hvm_domain(d) )
        {
            /*
             * If the domain has shared pages, relinquish them allowing
             * for preemption.
             */
            ret = relinquish_shared_pages(d);
            if ( ret )
                return ret;

            /*
             * If the domain is forked, decrement the parent's pause count
             * and release the domain.
             */
            if ( mem_sharing_is_fork(d) )
            {
                struct domain *parent = d->parent;

                d->parent = NULL;
                domain_unpause(parent);
                put_domain(parent);
            }
        }
#endif

        spin_lock(&d->page_alloc_lock);
        page_list_splice(&d->arch.relmem_list, &d->page_list);
        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
        spin_unlock(&d->page_alloc_lock);

    PROGRESS(xen):

        ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
        if ( ret )
            return ret;

    PROGRESS(l4):

        ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
        if ( ret )
            return ret;

    PROGRESS(l3):

        ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
        if ( ret )
            return ret;

    PROGRESS(l2):

        ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
        if ( ret )
            return ret;

    PROGRESS(done):
        break;

#undef PROGRESS

    default:
        BUG();
    }

    pit_deinit(d);

    if ( is_hvm_domain(d) )
        hvm_domain_relinquish_resources(d);

    return 0;
}

/*
 * Called during vcpu construction, and each time the toolstack changes the
 * CPUID configuration for the domain.
 */
void cpuid_policy_updated(struct vcpu *v)
{
    if ( is_hvm_vcpu(v) )
        hvm_cpuid_policy_changed(v);
}

void arch_dump_domain_info(struct domain *d)
{
    paging_dump_domain_info(d);
}

void arch_dump_vcpu_info(struct vcpu *v)
{
    paging_dump_vcpu_info(v);

    vpmu_dump(v);
}

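/*
 * Wake 'v' and, if it was already running, raise a softirq on its CPU so
 * that pending events are re-checked on the return-to-guest path.
 */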
void vcpu_kick(struct vcpu *v)
{
    /*
     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
     * pending flag. These values may fluctuate (after all, we hold no
     * locks) but the key insight is that each change will cause
     * evtchn_upcall_pending to be polled.
     *
     * NB2. We save the running flag across the unblock to avoid a needless
     * IPI for domains that we IPI'd to unblock.
     */
    bool running = v->is_running;

    vcpu_unblock(v);
    if ( running && (in_irq() || (v != current)) )
        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
}

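/*
 * Set the vcpu's evtchn_upcall_pending flag and, if it wasn't already set,
 * deliver the notification: assert the event-channel interrupt for HVM
 * guests, or kick the vcpu for PV guests.
 */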
void vcpu_mark_events_pending(struct vcpu *v)
{
    int already_pending = test_and_set_bit(
        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));

    if ( already_pending )
        return;

    if ( is_hvm_vcpu(v) )
        hvm_assert_evtchn_irq(v);
    else
        vcpu_kick(v);
}

static void vcpu_kick_softirq(void)
{
    /*
     * Nothing to do here: we merely prevent notifiers from racing with checks
     * executed on return to guest context with interrupts enabled. See, for
     * example, xxx_intr_assist() executed on return to HVM guest context.
     */
}

static int __init init_vcpu_kick_softirq(void)
{
    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
    return 0;
}
__initcall(init_vcpu_kick_softirq);

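/*
 * Pause the current domain on behalf of a debugger, and notify the global
 * debugger VIRQ unless gdbsx is already handling this vcpu's events.
 */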
void domain_pause_for_debugger(void)
{
#ifdef CONFIG_CRASH_DEBUG
    struct vcpu *curr = current;
    struct domain *d = curr->domain;

    domain_pause_by_systemcontroller_nosync(d);

    /* If gdbsx is active, we just need to pause the domain. */
    if ( curr->arch.gdbsx_vcpu_event == 0 )
        send_global_virq(VIRQ_DEBUGGER);
#endif
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */