
#ifndef __SCHED_H__
#define __SCHED_H__

#include <xen/types.h>
#include <xen/spinlock.h>
#include <xen/rwlock.h>
#include <xen/shared.h>
#include <xen/timer.h>
#include <xen/rangeset.h>
#include <xen/domain.h>
#include <xen/iommu.h>
#include <xen/rcupdate.h>
#include <xen/cpumask.h>
#include <xen/nodemask.h>
#include <xen/radix-tree.h>
#include <xen/multicall.h>
#include <xen/nospec.h>
#include <xen/tasklet.h>
#include <xen/mm.h>
#include <xen/smp.h>
#include <xen/perfc.h>
#include <asm/atomic.h>
#include <xen/vpci.h>
#include <xen/wait.h>
#include <public/xen.h>
#include <public/domctl.h>
#include <public/sysctl.h>
#include <public/vcpu.h>
#include <public/event_channel.h>

#ifdef CONFIG_COMPAT
#include <compat/vcpu.h>
DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
#endif

/*
 * Stats
 *
 * Enable and ease the use of scheduling related performance counters.
 *
 */
#ifdef CONFIG_PERF_COUNTERS
#define SCHED_STATS
#endif

#define SCHED_STAT_CRANK(_X)                (perfc_incr(_X))
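
/*
 * For illustration only: scheduler code bumps a named performance counter as
 * below, which expands to perfc_incr().  The counter name here is a
 * placeholder; a real counter needs a matching entry in perfc_defn.h.
 *
 *     SCHED_STAT_CRANK(example_sched_event);
 */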

/* A global pointer to the hardware domain (usually DOM0). */
extern struct domain *hardware_domain;

/* A global pointer to the initial cpupool (POOL0). */
extern struct cpupool *cpupool0;

#ifdef CONFIG_LATE_HWDOM
extern domid_t hardware_domid;
#else
#define hardware_domid 0
#endif

#ifndef CONFIG_COMPAT
#define BITS_PER_EVTCHN_WORD(d) BITS_PER_XEN_ULONG
#else
#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_XEN_ULONG)
#endif

#define BUCKETS_PER_GROUP  (PAGE_SIZE/sizeof(struct evtchn *))
/* Round size of struct evtchn up to power of 2 size */
#define __RDU2(x)   (       (x) | (   (x) >> 1))
#define __RDU4(x)   ( __RDU2(x) | ( __RDU2(x) >> 2))
#define __RDU8(x)   ( __RDU4(x) | ( __RDU4(x) >> 4))
#define __RDU16(x)  ( __RDU8(x) | ( __RDU8(x) >> 8))
#define __RDU32(x)  (__RDU16(x) | (__RDU16(x) >>16))
#define next_power_of_2(x)      (__RDU32((x)-1) + 1)

/* Maximum number of event channels for any ABI. */
#define MAX_NR_EVTCHNS MAX(EVTCHN_2L_NR_CHANNELS, EVTCHN_FIFO_NR_CHANNELS)

#define EVTCHNS_PER_BUCKET (PAGE_SIZE / next_power_of_2(sizeof(struct evtchn)))
#define EVTCHNS_PER_GROUP  (BUCKETS_PER_GROUP * EVTCHNS_PER_BUCKET)
#define NR_EVTCHN_GROUPS   DIV_ROUND_UP(MAX_NR_EVTCHNS, EVTCHNS_PER_GROUP)
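
/*
 * Worked example of the sizing above (the concrete values are assumptions,
 * as they depend on the architecture and on sizeof(struct evtchn)): the
 * __RDUn() steps smear the highest set bit of (x)-1 into all lower bits, so
 * next_power_of_2(48) == 64 and next_power_of_2(64) == 64.  With
 * PAGE_SIZE == 4096, 8-byte pointers, and
 * next_power_of_2(sizeof(struct evtchn)) == 64, this would give
 * BUCKETS_PER_GROUP == 512, EVTCHNS_PER_BUCKET == 64 and
 * EVTCHNS_PER_GROUP == 32768.
 */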

#define XEN_CONSUMER_BITS 3
#define NR_XEN_CONSUMERS ((1 << XEN_CONSUMER_BITS) - 1)

struct evtchn
{
    rwlock_t lock;
#define ECS_FREE         0 /* Channel is available for use.                  */
#define ECS_RESERVED     1 /* Channel is reserved.                           */
#define ECS_UNBOUND      2 /* Channel is waiting to bind to a remote domain. */
#define ECS_INTERDOMAIN  3 /* Channel is bound to another domain.            */
#define ECS_PIRQ         4 /* Channel is bound to a physical IRQ line.       */
#define ECS_VIRQ         5 /* Channel is bound to a virtual IRQ line.        */
#define ECS_IPI          6 /* Channel is bound to a virtual IPI line.        */
    u8  state;             /* ECS_* */
    u8  xen_consumer:XEN_CONSUMER_BITS; /* Consumer in Xen if nonzero */
    u8  pending:1;
    u16 notify_vcpu_id;    /* VCPU for local delivery notification */
    u32 port;
    union {
        struct {
            domid_t remote_domid;
        } unbound;     /* state == ECS_UNBOUND */
        struct {
            evtchn_port_t  remote_port;
            struct domain *remote_dom;
        } interdomain; /* state == ECS_INTERDOMAIN */
        struct {
            u32            irq;
            evtchn_port_t  next_port;
            evtchn_port_t  prev_port;
        } pirq;        /* state == ECS_PIRQ */
        u16 virq;      /* state == ECS_VIRQ */
    } u;
    u8 priority;
#ifndef NDEBUG
    u8 old_state;      /* State when taking lock in write mode. */
#endif
    u32 fifo_lastq;    /* Data for fifo events identifying last queue. */
#ifdef CONFIG_XSM
    union {
#ifdef XSM_NEED_GENERIC_EVTCHN_SSID
        /*
         * If an XSM module needs more space for its event channel context,
         * this pointer stores the necessary data for the security server.
         */
        void *generic;
#endif
#ifdef CONFIG_XSM_FLASK
        /*
         * Inlining the contents of the structure for FLASK avoids unneeded
         * allocations, and on 64-bit platforms with only FLASK enabled,
         * reduces the size of struct evtchn.
         */
        u32 flask_sid;
#endif
    } ssid;
#endif
} __attribute__((aligned(64)));

int  evtchn_init(struct domain *d, unsigned int max_port);
int  evtchn_destroy(struct domain *d); /* from domain_kill */
void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */

struct waitqueue_vcpu;

struct vcpu
{
    int              vcpu_id;

    int              processor;

    vcpu_info_t     *vcpu_info;

    struct domain   *domain;

    struct vcpu     *next_in_list;

    spinlock_t       periodic_timer_lock;
    s_time_t         periodic_period;
    s_time_t         periodic_last_event;
    struct timer     periodic_timer;
    struct timer     singleshot_timer;

    struct timer     poll_timer;    /* timeout for SCHEDOP_poll */

    struct sched_unit *sched_unit;

    struct vcpu_runstate_info runstate;
#ifndef CONFIG_COMPAT
# define runstate_guest(v) ((v)->runstate_guest)
    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
#else
# define runstate_guest(v) ((v)->runstate_guest.native)
    union {
        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
    } runstate_guest; /* guest address */
#endif
    unsigned int     new_state;

    /* Has the FPU been initialised? */
    bool             fpu_initialised;
    /* Has the FPU been used since it was last saved? */
    bool             fpu_dirtied;
    /* Initialization completed for this VCPU? */
    bool             is_initialised;
    /* Currently running on a CPU? */
    bool             is_running;
    /* VCPU should wake fast (do not deep sleep the CPU). */
    bool             is_urgent;
    /* VCPU must context_switch without scheduling unit. */
    bool             force_context_switch;
    /* Require shutdown to be deferred for some asynchronous operation? */
    bool             defer_shutdown;
    /* VCPU is paused following shutdown request (d->is_shutting_down)? */
    bool             paused_for_shutdown;
    /* VCPU needs affinity restored. */
    uint8_t          affinity_broken;
#define VCPU_AFFINITY_OVERRIDE    0x01
#define VCPU_AFFINITY_WAIT        0x02

    /* A hypercall has been preempted. */
    bool             hcall_preempted;
#ifdef CONFIG_COMPAT
    /* A hypercall is using the compat ABI? */
    bool             hcall_compat;
#endif

    /* The CPU, if any, which is holding onto this VCPU's state. */
#define VCPU_CPU_CLEAN (~0u)
    unsigned int     dirty_cpu;

    /*
     * > 0: a single port is being polled;
     * = 0: nothing is being polled (vcpu should be clear in d->poll_mask);
     * < 0: multiple ports may be being polled.
     */
    int              poll_evtchn;

    /* (over-)protected by ->domain->event_lock */
    int              pirq_evtchn_head;

    unsigned long    pause_flags;
    atomic_t         pause_count;

    /* VCPU paused for vm_event replies. */
    atomic_t         vm_event_pause_count;
    /* VCPU paused by system controller. */
    int              controller_pause_count;

    /* Grant table map tracking. */
    spinlock_t       maptrack_freelist_lock;
    unsigned int     maptrack_head;
    unsigned int     maptrack_tail;

    /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
    evtchn_port_t    virq_to_evtchn[NR_VIRQS];
    spinlock_t       virq_lock;

    /* Tasklet for continue_hypercall_on_cpu(). */
    struct tasklet   continue_hypercall_tasklet;

    /* Multicall information. */
    struct mc_state  mc_state;

    struct waitqueue_vcpu *waitqueue_vcpu;

    /* Guest-specified relocation of vcpu_info. */
    mfn_t            vcpu_info_mfn;

    struct evtchn_fifo_vcpu *evtchn_fifo;

    /* vPCI per-vCPU area, used to store data for long running operations. */
    struct vpci_vcpu vpci;

    struct arch_vcpu arch;
};

struct sched_unit {
    struct domain         *domain;
    struct vcpu           *vcpu_list;
    void                  *priv;      /* scheduler private data */
    struct sched_unit     *next_in_list;
    struct sched_resource *res;
    unsigned int           unit_id;

    /* Currently running on a CPU? */
    bool                   is_running;
    /* Does soft affinity actually play a role (given hard affinity)? */
    bool                   soft_aff_effective;
    /* Item has been migrated to other cpu(s). */
    bool                   migrated;

    /* Last time unit got (de-)scheduled. */
    uint64_t               state_entry_time;
    /* Vcpu state summary. */
    unsigned int           runstate_cnt[4];

    /* Bitmask of CPUs on which this VCPU may run. */
    cpumask_var_t          cpu_hard_affinity;
    /* Used to save affinity during temporary pinning. */
    cpumask_var_t          cpu_hard_affinity_saved;
    /* Bitmask of CPUs on which this VCPU prefers to run. */
    cpumask_var_t          cpu_soft_affinity;

    /* Next unit to run. */
    struct sched_unit      *next_task;
    s_time_t                next_time;

    /* Number of vcpus not yet joined for context switch. */
    unsigned int            rendezvous_in_cnt;

    /* Number of vcpus not yet finished with context switch. */
    atomic_t                rendezvous_out_cnt;
};

#define for_each_sched_unit(d, u)                                         \
    for ( (u) = (d)->sched_unit_list; (u) != NULL; (u) = (u)->next_in_list )

/*
 * All vcpus of a domain are in a single linked list, with unit->vcpu_list
 * pointing to the first vcpu of the unit. The loop must terminate as soon as
 * it reaches a vcpu that is not part of the unit being iterated over.
 */
#define for_each_sched_unit_vcpu(u, v)                                    \
    for ( (v) = (u)->vcpu_list;                                           \
          (v) != NULL && (!(u)->next_in_list ||                           \
                          (v)->vcpu_id < (u)->next_in_list->unit_id);     \
          (v) = (v)->next_in_list )
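
/*
 * Usage sketch for the two iterators above (the printk() body is purely
 * illustrative and "d" is assumed to be a valid domain pointer):
 *
 *     struct sched_unit *unit;
 *     struct vcpu *v;
 *
 *     for_each_sched_unit ( d, unit )
 *         for_each_sched_unit_vcpu ( unit, v )
 *             printk("unit %u: vcpu %d\n", unit->unit_id, v->vcpu_id);
 */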

/* Per-domain lock can be recursively acquired in fault handlers. */
#define domain_lock(d) spin_lock_recursive(&(d)->domain_lock)
#define domain_unlock(d) spin_unlock_recursive(&(d)->domain_lock)

struct evtchn_port_ops;

struct domain
{
    domid_t          domain_id;

    unsigned int     max_vcpus;
    struct vcpu    **vcpu;

    shared_info_t   *shared_info;     /* shared data area */

    rcu_read_lock_t  rcu_lock;

    spinlock_t       domain_lock;

    spinlock_t       page_alloc_lock; /* protects all the following fields  */
    struct page_list_head page_list;  /* linked list */
    struct page_list_head extra_page_list; /* linked list (size extra_pages) */
    struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */

    /*
     * This field should only be directly accessed by domain_adjust_tot_pages()
     * and the domain_tot_pages() helper function defined below.
     */
    unsigned int     tot_pages;

    unsigned int     xenheap_pages;     /* pages allocated from Xen heap */
    unsigned int     outstanding_pages; /* pages claimed but not possessed */
    unsigned int     max_pages;         /* maximum value for domain_tot_pages() */
    unsigned int     extra_pages;       /* pages not included in domain_tot_pages() */
    atomic_t         shr_pages;         /* shared pages */
    atomic_t         paged_pages;       /* paged-out pages */

    /* Scheduling. */
    void            *sched_priv;    /* scheduler-specific data */
    struct sched_unit *sched_unit_list;
    struct cpupool  *cpupool;

    struct domain   *next_in_list;
    struct domain   *next_in_hashbucket;

    struct list_head rangesets;
    spinlock_t       rangesets_lock;

    /* Event channel information. */
    struct evtchn   *evtchn;                         /* first bucket only */
    struct evtchn  **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
    unsigned int     max_evtchn_port; /* max permitted port number */
    unsigned int     valid_evtchns;   /* number of allocated event channels */
    /*
     * Number of in-use event channels.  Writers should use write_atomic().
     * Readers need to use read_atomic() only when not holding event_lock.
     */
    unsigned int     active_evtchns;
    /*
     * Number of event channels used internally by Xen (not subject to
     * EVTCHNOP_reset).  Read/write access like for active_evtchns.
     */
    unsigned int     xen_evtchns;
    /* Port to resume from in evtchn_reset(), when in a continuation. */
    unsigned int     next_evtchn;
    spinlock_t       event_lock;
    const struct evtchn_port_ops *evtchn_port_ops;
    struct evtchn_fifo_domain *evtchn_fifo;

    struct grant_table *grant_table;

    /*
     * Interrupt to event-channel mappings and other per-guest-pirq data.
     * Protected by the domain's event-channel spinlock.
     */
    struct radix_tree_root pirq_tree;
    unsigned int     nr_pirqs;

    unsigned int     options;         /* copy of createdomain flags */

    /* Is this guest dying (i.e., a zombie)? */
    enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;

    /* Domain is paused by controller software? */
    int              controller_pause_count;

    struct {
        int64_t seconds;
        bool set;
    } time_offset;

#ifdef CONFIG_HAS_PCI
    struct list_head pdev_list;
#endif

#ifdef CONFIG_HAS_PASSTHROUGH
    struct domain_iommu iommu;
#endif
    /* is node-affinity automatically computed? */
    bool             auto_node_affinity;
    /* Is this guest fully privileged (aka dom0)? */
    bool             is_privileged;
    /* Can this guest access the Xen console? */
    bool             is_console;
    /* Non-migratable and non-restoreable? */
    bool             disable_migrate;
    /* Is this guest being debugged by dom0? */
    bool             debugger_attached;
    /*
     * Set to true at the very end of domain creation, when the domain is
     * unpaused for the first time by the system controller.
     */
    bool             creation_finished;

    /* Which guest this guest has privileges on */
    struct domain   *target;

    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
#if MAX_VIRT_CPUS <= BITS_PER_LONG
    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
#else
    unsigned long   *poll_mask;
#endif

    /* I/O capabilities (access to IRQs and memory-mapped I/O). */
    struct rangeset *iomem_caps;
    struct rangeset *irq_caps;

    /* Guest has shut down (inc. reason code)? */
    spinlock_t       shutdown_lock;
    bool             is_shutting_down; /* in process of shutting down? */
    bool             is_shut_down;     /* fully shut down? */
#define SHUTDOWN_CODE_INVALID ~0u
    unsigned int     shutdown_code;

    /* If this is not 0, send suspend notification here instead of
     * raising DOM_EXC */
    evtchn_port_t    suspend_evtchn;

    atomic_t         pause_count;
    atomic_t         refcnt;

    unsigned long    vm_assist;

    /* Bitmask of CPUs which are holding onto this domain's state. */
    cpumask_var_t    dirty_cpumask;

    struct arch_domain arch;

    void *ssid; /* sHype security subject identifier */

    /* Control-plane tools handle for this domain. */
    xen_domain_handle_t handle;

    /* hvm_print_line() and guest_console_write() logging. */
#define DOMAIN_PBUF_SIZE 200
    char       *pbuf;
    unsigned    pbuf_idx;
    spinlock_t  pbuf_lock;

    /* OProfile support. */
    struct xenoprof *xenoprof;

    /* Domain watchdog. */
#define NR_DOMAIN_WATCHDOG_TIMERS 2
    spinlock_t watchdog_lock;
    uint32_t watchdog_inuse_map;
    struct timer watchdog_timer[NR_DOMAIN_WATCHDOG_TIMERS];

    struct rcu_head rcu;

    /*
     * Hypercall deadlock avoidance lock. Used if a hypercall might
     * cause a deadlock. Acquirers don't spin waiting; they preempt.
     */
    spinlock_t hypercall_deadlock_mutex;

    struct lock_profile_qhead profile_head;

    /* Various vm_events */

    /* Memory sharing support */
#ifdef CONFIG_MEM_SHARING
    struct vm_event_domain *vm_event_share;
    struct domain *parent; /* VM fork parent */
#endif
    /* Memory paging support */
#ifdef CONFIG_HAS_MEM_PAGING
    struct vm_event_domain *vm_event_paging;
#endif
    /* VM event monitor support */
    struct vm_event_domain *vm_event_monitor;

    /*
     * Can be specified by the user. If that is not the case, it is
     * computed from the union of all the vcpu cpu-affinity masks.
     */
    nodemask_t node_affinity;
    unsigned int last_alloc_node;
    spinlock_t node_affinity_lock;

    /* vNUMA topology accesses are protected by rwlock. */
    rwlock_t vnuma_rwlock;
    struct vnuma_info *vnuma;

    /* Common monitor options */
    struct {
        unsigned int guest_request_enabled       : 1;
        unsigned int guest_request_sync          : 1;
    } monitor;

#ifdef CONFIG_ARGO
    /* Argo interdomain communication support */
    struct argo_domain *argo;
#endif
};

static inline struct page_list_head *page_to_list(
    struct domain *d, const struct page_info *pg)
{
    if ( is_xen_heap_page(pg) )
        return &d->xenpage_list;

    if ( pg->count_info & PGC_extra )
        return &d->extra_page_list;

    return &d->page_list;
}

/* Return number of pages currently possessed by the domain */
static inline unsigned int domain_tot_pages(const struct domain *d)
{
    ASSERT(d->extra_pages <= d->tot_pages);

    return d->tot_pages - d->extra_pages;
}
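
/*
 * Worked example (numbers are purely illustrative): a domain with
 * tot_pages == 1024, of which extra_pages == 16 sit on extra_page_list,
 * reports domain_tot_pages() == 1008; it is this value that is compared
 * against max_pages when deciding whether further allocations may succeed.
 */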

/* Protect updates/reads (resp.) of domain_list and domain_hash. */
extern spinlock_t domlist_update_lock;
extern rcu_read_lock_t domlist_read_lock;

extern struct vcpu *idle_vcpu[NR_CPUS];
#define is_idle_domain(d) ((d)->domain_id == DOMID_IDLE)
#define is_idle_vcpu(v)   (is_idle_domain((v)->domain))

static inline bool is_system_domain(const struct domain *d)
{
    return d->domain_id >= DOMID_FIRST_RESERVED;
}

#define DOMAIN_DESTROYED (1u << 31) /* assumes atomic_t is >= 32 bits */
#define put_domain(_d) \
  if ( atomic_dec_and_test(&(_d)->refcnt) ) domain_destroy(_d)

/*
 * Use this when you don't have an existing reference to @d. It returns
 * FALSE if @d is being destroyed.
 */
static always_inline bool get_domain(struct domain *d)
{
    int old, seen = atomic_read(&d->refcnt);
    do
    {
        old = seen;
        if ( unlikely(old & DOMAIN_DESTROYED) )
            return false;
        seen = atomic_cmpxchg(&d->refcnt, old, old + 1);
    }
    while ( unlikely(seen != old) );
    return true;
}
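
/*
 * Typical usage of the reference counting helpers above; "d" is assumed to
 * be a domain pointer that is currently protected from disappearing (e.g.
 * by RCU) but for which no reference is held yet.  If get_domain() fails,
 * the domain is already being destroyed and must not be used:
 *
 *     if ( !get_domain(d) )
 *         return -EINVAL;
 *     ... use d ...
 *     put_domain(d);
 */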

/*
 * Use this when you already have, or are borrowing, a reference to @d.
 * In this case we know that @d cannot be destroyed under our feet.
 */
static inline void get_knownalive_domain(struct domain *d)
{
    atomic_inc(&d->refcnt);
    ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
}

int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
void domain_update_node_affinity(struct domain *d);

/*
 * To be implemented by each architecture, sanity checking the configuration
 * and filling in any appropriate defaults.
 */
int arch_sanitise_domain_config(struct xen_domctl_createdomain *config);

/*
 * Create a domain: the configuration is only necessary for real domains
 * (domid < DOMID_FIRST_RESERVED).
 */
struct domain *domain_create(domid_t domid,
                             struct xen_domctl_createdomain *config,
                             bool is_priv);

/*
 * rcu_lock_domain_by_id() is more efficient than get_domain_by_id().
 * This is the preferred function if the returned domain reference
 * is short lived, but it cannot be used if the domain reference needs
 * to be kept beyond the current scope (e.g., across a softirq).
 * The returned domain reference must be discarded using rcu_unlock_domain().
 */
struct domain *rcu_lock_domain_by_id(domid_t dom);

/*
 * As the above function, but resolves DOMID_SELF to the current domain.
 */
struct domain *rcu_lock_domain_by_any_id(domid_t dom);

/*
 * As rcu_lock_domain_by_id(), but will fail EPERM or ESRCH rather than resolve
 * to local domain.
 */
int rcu_lock_remote_domain_by_id(domid_t dom, struct domain **d);

/*
 * As rcu_lock_remote_domain_by_id() but will fail EINVAL if the domain is
 * dying.
 */
int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d);

static inline void rcu_unlock_domain(struct domain *d)
{
    if ( d != current->domain )
        rcu_read_unlock(&d->rcu_lock);
}
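
/*
 * Typical lookup/unlock pairing, as described above rcu_lock_domain_by_id()
 * (the domid value and error code are illustrative):
 *
 *     struct domain *d = rcu_lock_domain_by_id(domid);
 *
 *     if ( d == NULL )
 *         return -ESRCH;
 *     ... short-lived use of d ...
 *     rcu_unlock_domain(d);
 */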

static inline struct domain *rcu_lock_domain(struct domain *d)
{
    if ( d != current->domain )
        rcu_read_lock(&d->rcu_lock);
    return d;
}

static inline struct domain *rcu_lock_current_domain(void)
{
    return /*rcu_lock_domain*/(current->domain);
}

struct domain *get_domain_by_id(domid_t dom);

struct domain *get_pg_owner(domid_t domid);

static inline void put_pg_owner(struct domain *pg_owner)
{
    rcu_unlock_domain(pg_owner);
}

void domain_destroy(struct domain *d);
int domain_kill(struct domain *d);
int domain_shutdown(struct domain *d, u8 reason);
void domain_resume(struct domain *d);

int domain_soft_reset(struct domain *d, bool resuming);

int vcpu_start_shutdown_deferral(struct vcpu *v);
void vcpu_end_shutdown_deferral(struct vcpu *v);

/*
 * Mark specified domain as crashed. This function always returns, even if the
 * caller is the specified domain. The domain is not synchronously descheduled
 * from any processor.
 */
void __domain_crash(struct domain *d);
#define domain_crash(d) do {                                              \
    printk("domain_crash called from %s:%d\n", __FILE__, __LINE__);       \
    __domain_crash(d);                                                    \
} while (0)

/*
 * Called from assembly code, with an optional address to help indicate why
 * the crash occurred.  If addr is 0, look up address from last extable
 * redirection.
 */
void noreturn asm_domain_crash_synchronous(unsigned long addr);

void scheduler_init(void);
int  sched_init_vcpu(struct vcpu *v);
void sched_destroy_vcpu(struct vcpu *v);
int  sched_init_domain(struct domain *d, int poolid);
void sched_destroy_domain(struct domain *d);
long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
long sched_adjust_global(struct xen_sysctl_scheduler_op *);
int  sched_id(void);
void vcpu_wake(struct vcpu *v);
long vcpu_yield(void);
void vcpu_sleep_nosync(struct vcpu *v);
void vcpu_sleep_sync(struct vcpu *v);

/*
 * Force synchronisation of given VCPU's state. If it is currently descheduled,
 * this call will ensure that all its state is committed to memory and that
 * no CPU is using critical state (e.g., page tables) belonging to the VCPU.
 */
void sync_vcpu_execstate(struct vcpu *v);

/* As above, for any lazy state being held on the local CPU. */
void sync_local_execstate(void);

/*
 * Called by the scheduler to switch to another VCPU. This function must
 * call sched_context_switched(@prev, @next) when the local CPU is no longer
 * running in @prev's context, and that context is saved to memory.
 * Alternatively, if implementing lazy context switching, it suffices to ensure
 * that invoking sync_vcpu_execstate() will switch and commit @prev's state.
 */
void context_switch(
    struct vcpu *prev,
    struct vcpu *next);

/*
 * As described above, context_switch() must call this function when the
 * local CPU is no longer running in @prev's context, and @prev's context is
 * saved to memory. Alternatively, if implementing lazy context switching,
 * ensure that invoking sync_vcpu_execstate() will switch and commit @prev.
 */
void sched_context_switched(struct vcpu *prev, struct vcpu *vnext);

/* Called by the scheduler to continue running the current VCPU. */
void continue_running(
    struct vcpu *same);

void startup_cpu_idle_loop(void);
extern void (*pm_idle) (void);
extern void (*dead_idle) (void);


/*
 * Creates a continuation to resume the current hypercall. The caller should
 * return immediately, propagating the value returned from this invocation.
 * The format string specifies the types and number of hypercall arguments.
 * It contains one character per argument as follows:
 *  'i' [unsigned] {char, int}
 *  'l' [unsigned] long
 *  'h' guest handle (XEN_GUEST_HANDLE(foo))
 */
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...);
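
/*
 * Example of the format string in use: a preempted handler arranges for the
 * same hypercall to be re-issued with its guest handle argument ('h'), e.g.
 *
 *     return hypercall_create_continuation(__HYPERVISOR_domctl, "h",
 *                                          u_domctl);
 *
 * The hypercall number and argument name here are illustrative.
 */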

static inline void hypercall_cancel_continuation(struct vcpu *v)
{
    v->hcall_preempted = false;
}

/*
 * For long-running operations that must be in hypercall context, check
 * if there is background work to be done that should interrupt this
 * operation.
 */
#define hypercall_preempt_check() (unlikely(    \
        softirq_pending(smp_processor_id()) |   \
        local_events_need_delivery()            \
    ))

/*
 * For long-running operations that may be in hypercall context or on
 * the idle vcpu (e.g. during dom0 construction), check if there is
 * background work to be done that should interrupt this operation.
 */
#define general_preempt_check() (unlikely(                          \
        softirq_pending(smp_processor_id()) ||                      \
        (!is_idle_vcpu(current) && local_events_need_delivery())    \
    ))
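
/*
 * Sketch of how a long-running hypercall combines the check above with a
 * continuation (process_one_item() and the loop bounds are hypothetical):
 *
 *     for ( i = start; i < count; i++ )
 *     {
 *         process_one_item(i);
 *         if ( i + 1 < count && hypercall_preempt_check() )
 *             return hypercall_create_continuation(op, "ll", i + 1, count);
 *     }
 */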

extern struct domain *domain_list;

/* Caller must hold the domlist_read_lock or domlist_update_lock. */
static inline struct domain *first_domain_in_cpupool(const struct cpupool *c)
{
    struct domain *d;
    for (d = rcu_dereference(domain_list); d && d->cpupool != c;
         d = rcu_dereference(d->next_in_list));
    return d;
}
static inline struct domain *next_domain_in_cpupool(
    struct domain *d, const struct cpupool *c)
{
    for (d = rcu_dereference(d->next_in_list); d && d->cpupool != c;
         d = rcu_dereference(d->next_in_list));
    return d;
}

#define for_each_domain(_d)                     \
 for ( (_d) = rcu_dereference(domain_list);     \
       (_d) != NULL;                            \
       (_d) = rcu_dereference((_d)->next_in_list )) \

#define for_each_domain_in_cpupool(_d,_c)       \
 for ( (_d) = first_domain_in_cpupool(_c);      \
       (_d) != NULL;                            \
       (_d) = next_domain_in_cpupool((_d), (_c)))

#define for_each_vcpu(_d,_v)                    \
 for ( (_v) = (_d)->vcpu ? (_d)->vcpu[0] : NULL; \
       (_v) != NULL;                            \
       (_v) = (_v)->next_in_list )
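
/*
 * Usage sketch: walking every vcpu of every domain.  As noted above
 * first_domain_in_cpupool(), the domain list may only be traversed under
 * domlist_read_lock (or domlist_update_lock); the printk() body is purely
 * illustrative:
 *
 *     struct domain *d;
 *     struct vcpu *v;
 *
 *     rcu_read_lock(&domlist_read_lock);
 *     for_each_domain ( d )
 *         for_each_vcpu ( d, v )
 *             printk("d%d v%d\n", d->domain_id, v->vcpu_id);
 *     rcu_read_unlock(&domlist_read_lock);
 */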

/*
 * Per-VCPU pause flags.
 */
 /* Domain is blocked waiting for an event. */
#define _VPF_blocked         0
#define VPF_blocked          (1UL<<_VPF_blocked)
 /* VCPU is offline. */
#define _VPF_down            1
#define VPF_down             (1UL<<_VPF_down)
 /* VCPU is blocked awaiting an event to be consumed by Xen. */
#define _VPF_blocked_in_xen  2
#define VPF_blocked_in_xen   (1UL<<_VPF_blocked_in_xen)
 /* VCPU affinity has changed: migrating to a new CPU. */
#define _VPF_migrating       3
#define VPF_migrating        (1UL<<_VPF_migrating)
 /* VCPU is blocked due to missing mem_paging ring. */
#define _VPF_mem_paging      4
#define VPF_mem_paging       (1UL<<_VPF_mem_paging)
 /* VCPU is blocked due to missing mem_access ring. */
#define _VPF_mem_access      5
#define VPF_mem_access       (1UL<<_VPF_mem_access)
 /* VCPU is blocked due to missing mem_sharing ring. */
#define _VPF_mem_sharing     6
#define VPF_mem_sharing      (1UL<<_VPF_mem_sharing)
 /* VCPU is being reset. */
#define _VPF_in_reset        7
#define VPF_in_reset         (1UL<<_VPF_in_reset)
/* VCPU is parked. */
#define _VPF_parked          8
#define VPF_parked           (1UL<<_VPF_parked)

static inline bool vcpu_runnable(const struct vcpu *v)
{
    return !(v->pause_flags |
             atomic_read(&v->pause_count) |
             atomic_read(&v->domain->pause_count));
}

static inline bool is_vcpu_dirty_cpu(unsigned int cpu)
{
    BUILD_BUG_ON(NR_CPUS >= VCPU_CPU_CLEAN);
    return cpu != VCPU_CPU_CLEAN;
}

static inline bool vcpu_cpu_dirty(const struct vcpu *v)
{
    return is_vcpu_dirty_cpu(read_atomic(&v->dirty_cpu));
}

void vcpu_block(void);
void vcpu_unblock(struct vcpu *v);
void vcpu_pause(struct vcpu *v);
void vcpu_pause_nosync(struct vcpu *v);
void vcpu_unpause(struct vcpu *v);
int vcpu_pause_by_systemcontroller(struct vcpu *v);
int vcpu_unpause_by_systemcontroller(struct vcpu *v);

void domain_pause(struct domain *d);
void domain_pause_nosync(struct domain *d);
void domain_unpause(struct domain *d);
int domain_unpause_by_systemcontroller(struct domain *d);
int __domain_pause_by_systemcontroller(struct domain *d,
                                       void (*pause_fn)(struct domain *d));
static inline int domain_pause_by_systemcontroller(struct domain *d)
{
    return __domain_pause_by_systemcontroller(d, domain_pause);
}
static inline int domain_pause_by_systemcontroller_nosync(struct domain *d)
{
    return __domain_pause_by_systemcontroller(d, domain_pause_nosync);
}

/* domain_pause() but safe against trying to pause current. */
int __must_check domain_pause_except_self(struct domain *d);
void domain_unpause_except_self(struct domain *d);

/*
 * For each allocated vcpu, d->vcpu[X]->vcpu_id == X
 *
 * During construction, all vcpus in d->vcpu[] are allocated sequentially, and
 * in ascending order.  Therefore, if d->vcpu[N] exists (e.g. derived from
 * current), all vcpus with an id less than N also exist.
 *
 * SMP considerations: The idle domain is constructed before APs are started.
 * All other domains have d->vcpu[] allocated and d->max_vcpus set before the
 * domain is made visible in the domlist, which is serialised on the global
 * domlist_update_lock.
 *
 * Therefore, all observations of d->max_vcpus vs d->vcpu[] will be consistent
 * despite the lack of smp_* barriers, either by being on the same CPU as the
 * one which issued the writes, or because of barrier properties of the domain
 * having been inserted into the domlist.
 */
static inline struct vcpu *domain_vcpu(const struct domain *d,
                                       unsigned int vcpu_id)
{
    unsigned int idx = array_index_nospec(vcpu_id, d->max_vcpus);

    return vcpu_id >= d->max_vcpus ? NULL : d->vcpu[idx];
}
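
/*
 * Usage sketch: hypercall handlers use domain_vcpu() rather than indexing
 * d->vcpu[] directly, so a guest-supplied id that is out of range yields
 * NULL and the in-range access is bounded under speculation (the error code
 * below is illustrative):
 *
 *     struct vcpu *v = domain_vcpu(d, vcpuid);
 *
 *     if ( v == NULL )
 *         return -ENOENT;
 */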

void cpu_init(void);

/*
 * A vcpu is urgent if it is polling an event channel.
 *
 * If an urgent vcpu exists, the CPU should not enter a deep C-state.
 */
DECLARE_PER_CPU(atomic_t, sched_urgent_count);
static inline bool sched_has_urgent_vcpu(void)
{
    return atomic_read(&this_cpu(sched_urgent_count));
}

void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value);
void sched_setup_dom0_vcpus(struct domain *d);
int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason);
int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity);
void restore_vcpu_affinity(struct domain *d);
int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
                         struct xen_domctl_vcpuaffinity *vcpuaff);

void vcpu_runstate_get(const struct vcpu *v,
                       struct vcpu_runstate_info *runstate);
uint64_t get_cpu_idle_time(unsigned int cpu);
void sched_guest_idle(void (*idle) (void), unsigned int cpu);
void scheduler_enable(void);
void scheduler_disable(void);

/*
 * Used by idle loop to decide whether there is work to do:
 *  (1) Deal with RCU; (2) or run softirqs; or (3) Play dead;
 *  or (4) Run tasklets.
 *
 * About (4), if a tasklet is enqueued, it will be scheduled
 * really really soon, and hence it's pointless to try to
 * sleep between these two events (that's why we don't call
 * the tasklet_work_to_do() helper).
 */
#define cpu_is_haltable(cpu)                    \
    (!rcu_needs_cpu(cpu) &&                     \
     !softirq_pending(cpu) &&                   \
     cpu_online(cpu) &&                         \
     !per_cpu(tasklet_work_to_do, cpu))

void watchdog_domain_init(struct domain *d);
void watchdog_domain_destroy(struct domain *d);

/*
 * Use this check when the following are both true:
 *  - Using this feature or interface requires full access to the hardware
 *    (that is, this would not be suitable for a driver domain)
 *  - There is never a reason to deny the hardware domain access to this
 */
static always_inline bool is_hardware_domain(const struct domain *d)
{
    if ( IS_ENABLED(CONFIG_PV_SHIM_EXCLUSIVE) )
        return false;

    return evaluate_nospec(d == hardware_domain);
}

/* This check is for functionality specific to a control domain */
static always_inline bool is_control_domain(const struct domain *d)
{
    if ( IS_ENABLED(CONFIG_PV_SHIM_EXCLUSIVE) )
        return false;

    return evaluate_nospec(d->is_privileged);
}

#define VM_ASSIST(d, t) (test_bit(VMASST_TYPE_ ## t, &(d)->vm_assist))

static always_inline bool is_pv_domain(const struct domain *d)
{
    return IS_ENABLED(CONFIG_PV) &&
        evaluate_nospec(!(d->options & XEN_DOMCTL_CDF_hvm));
}

static always_inline bool is_pv_vcpu(const struct vcpu *v)
{
    return is_pv_domain(v->domain);
}

#ifdef CONFIG_COMPAT
static always_inline bool is_pv_32bit_domain(const struct domain *d)
{
#ifdef CONFIG_PV32
    return is_pv_domain(d) && d->arch.pv.is_32bit;
#else
    return false;
#endif
}

static always_inline bool is_pv_32bit_vcpu(const struct vcpu *v)
{
    return is_pv_32bit_domain(v->domain);
}

static always_inline bool is_pv_64bit_domain(const struct domain *d)
{
    if ( !is_pv_domain(d) )
        return false;

#ifdef CONFIG_PV32
    return !d->arch.pv.is_32bit;
#else
    return true;
#endif
}

static always_inline bool is_pv_64bit_vcpu(const struct vcpu *v)
{
    return is_pv_64bit_domain(v->domain);
}
#endif
static always_inline bool is_hvm_domain(const struct domain *d)
{
    return IS_ENABLED(CONFIG_HVM) &&
        evaluate_nospec(d->options & XEN_DOMCTL_CDF_hvm);
}

static always_inline bool is_hvm_vcpu(const struct vcpu *v)
{
    return is_hvm_domain(v->domain);
}

static always_inline bool hap_enabled(const struct domain *d)
{
    /* sanitise_domain_config() rejects HAP && !HVM */
    return IS_ENABLED(CONFIG_HVM) &&
        evaluate_nospec(d->options & XEN_DOMCTL_CDF_hap);
}

static inline bool is_hwdom_pinned_vcpu(const struct vcpu *v)
{
    return (is_hardware_domain(v->domain) &&
            cpumask_weight(v->sched_unit->cpu_hard_affinity) == 1);
}

static inline bool is_vcpu_online(const struct vcpu *v)
{
    return !test_bit(_VPF_down, &v->pause_flags);
}

static inline bool is_xenstore_domain(const struct domain *d)
{
    return d->options & XEN_DOMCTL_CDF_xs_domain;
}

static always_inline bool is_iommu_enabled(const struct domain *d)
{
    return evaluate_nospec(d->options & XEN_DOMCTL_CDF_iommu);
}

extern bool sched_smt_power_savings;
extern bool sched_disable_smt_switching;

extern enum cpufreq_controller {
    FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
} cpufreq_controller;

static always_inline bool is_cpufreq_controller(const struct domain *d)
{
    /*
     * A PV dom0 can be nominated as the cpufreq controller, instead of using
     * Xen's cpufreq driver, at which point dom0 gets direct access to certain
     * MSRs.
     *
     * This interface only works when dom0 is identity pinned and has the same
     * number of vCPUs as pCPUs on the system.
     *
     * It would be far better to paravirtualise the interface.
     */
    return (is_pv_domain(d) && is_hardware_domain(d) &&
            cpufreq_controller == FREQCTL_dom0_kernel);
}

int cpupool_move_domain(struct domain *d, struct cpupool *c);
int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op);
int cpupool_get_id(const struct domain *d);
const cpumask_t *cpupool_valid_cpus(const struct cpupool *pool);
extern void dump_runq(unsigned char key);

void arch_do_physinfo(struct xen_sysctl_physinfo *pi);

#endif /* __SCHED_H__ */

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */