#ifndef __SCHED_H__
#define __SCHED_H__

#include <xen/types.h>
#include <xen/spinlock.h>
#include <xen/rwlock.h>
#include <xen/shared.h>
#include <xen/timer.h>
#include <xen/rangeset.h>
#include <xen/domain.h>
#include <xen/iommu.h>
#include <xen/rcupdate.h>
#include <xen/cpumask.h>
#include <xen/nodemask.h>
#include <xen/radix-tree.h>
#include <xen/multicall.h>
#include <xen/nospec.h>
#include <xen/tasklet.h>
#include <xen/mm.h>
#include <xen/smp.h>
#include <xen/perfc.h>
#include <asm/atomic.h>
#include <xen/vpci.h>
#include <xen/wait.h>
#include <public/xen.h>
#include <public/domctl.h>
#include <public/sysctl.h>
#include <public/vcpu.h>
#include <public/event_channel.h>

#ifdef CONFIG_COMPAT
#include <compat/vcpu.h>
DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
#endif

/*
 * Stats
 *
 * Enable and ease the use of scheduling-related performance counters.
 */
#ifdef CONFIG_PERF_COUNTERS
#define SCHED_STATS
#endif

#define SCHED_STAT_CRANK(_X) (perfc_incr(_X))

/* A global pointer to the hardware domain (usually DOM0). */
extern struct domain *hardware_domain;

/* A global pointer to the initial cpupool (POOL0). */
extern struct cpupool *cpupool0;

#ifdef CONFIG_LATE_HWDOM
extern domid_t hardware_domid;
#else
#define hardware_domid 0
#endif

#ifndef CONFIG_COMPAT
#define BITS_PER_EVTCHN_WORD(d) BITS_PER_XEN_ULONG
#else
#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_XEN_ULONG)
#endif

#define BUCKETS_PER_GROUP  (PAGE_SIZE/sizeof(struct evtchn *))
/* Round size of struct evtchn up to power of 2 size */
#define __RDU2(x)   ( (x) | ( (x) >> 1))
#define __RDU4(x)   ( __RDU2(x) | ( __RDU2(x) >> 2))
#define __RDU8(x)   ( __RDU4(x) | ( __RDU4(x) >> 4))
#define __RDU16(x)  ( __RDU8(x) | ( __RDU8(x) >> 8))
#define __RDU32(x)  (__RDU16(x) | (__RDU16(x) >>16))
#define next_power_of_2(x)      (__RDU32((x)-1) + 1)
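/*
 * Illustrative example only: __RDU32(x) propagates the most significant set
 * bit of x into every lower bit position, so starting from (x)-1 and adding 1
 * rounds up to the next power of two, e.g.:
 *   next_power_of_2(96): __RDU32(95) = 0x7f, +1 = 128
 *   next_power_of_2(64): __RDU32(63) = 0x3f, +1 = 64  (exact powers map to
 *                                                      themselves)
 */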

/* Maximum number of event channels for any ABI. */
#define MAX_NR_EVTCHNS  MAX(EVTCHN_2L_NR_CHANNELS, EVTCHN_FIFO_NR_CHANNELS)

#define EVTCHNS_PER_BUCKET (PAGE_SIZE / next_power_of_2(sizeof(struct evtchn)))
#define EVTCHNS_PER_GROUP  (BUCKETS_PER_GROUP * EVTCHNS_PER_BUCKET)
#define NR_EVTCHN_GROUPS   DIV_ROUND_UP(MAX_NR_EVTCHNS, EVTCHNS_PER_GROUP)
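/*
 * Rough capacity sketch, assuming 4k pages, a 64-bit build and struct evtchn
 * rounding up to 64 bytes (check the actual build for real values):
 *   EVTCHNS_PER_BUCKET = 4096 / 64      = 64
 *   BUCKETS_PER_GROUP  = 4096 / 8       = 512
 *   EVTCHNS_PER_GROUP  = 512 * 64       = 32768
 *   NR_EVTCHN_GROUPS   = 131072 / 32768 = 4     (FIFO ABI, 2^17 channels)
 * d->evtchn below covers the first bucket only; all further buckets are
 * reached through d->evtchn_group[].
 */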

#define XEN_CONSUMER_BITS 3
#define NR_XEN_CONSUMERS ((1 << XEN_CONSUMER_BITS) - 1)

struct evtchn
{
    rwlock_t lock;
#define ECS_FREE         0 /* Channel is available for use.                  */
#define ECS_RESERVED     1 /* Channel is reserved.                           */
#define ECS_UNBOUND      2 /* Channel is waiting to bind to a remote domain. */
#define ECS_INTERDOMAIN  3 /* Channel is bound to another domain.            */
#define ECS_PIRQ         4 /* Channel is bound to a physical IRQ line.       */
#define ECS_VIRQ         5 /* Channel is bound to a virtual IRQ line.        */
#define ECS_IPI          6 /* Channel is bound to a virtual IPI line.        */
    u8 state;              /* ECS_* */
    u8 xen_consumer:XEN_CONSUMER_BITS; /* Consumer in Xen if nonzero */
    u8 pending:1;
    u16 notify_vcpu_id;    /* VCPU for local delivery notification */
    u32 port;
    union {
        struct {
            domid_t remote_domid;
        } unbound;     /* state == ECS_UNBOUND */
        struct {
            evtchn_port_t  remote_port;
            struct domain *remote_dom;
        } interdomain; /* state == ECS_INTERDOMAIN */
        struct {
            u32            irq;
            evtchn_port_t  next_port;
            evtchn_port_t  prev_port;
        } pirq;        /* state == ECS_PIRQ */
        u16 virq;      /* state == ECS_VIRQ */
    } u;
    u8 priority;
#ifndef NDEBUG
    u8 old_state;      /* State when taking lock in write mode. */
#endif
    u32 fifo_lastq;    /* Data for fifo events identifying last queue. */
#ifdef CONFIG_XSM
    union {
#ifdef XSM_NEED_GENERIC_EVTCHN_SSID
        /*
         * If an XSM module needs more space for its event channel context,
         * this pointer stores the necessary data for the security server.
         */
        void *generic;
#endif
#ifdef CONFIG_XSM_FLASK
        /*
         * Inlining the contents of the structure for FLASK avoids unneeded
         * allocations, and on 64-bit platforms with only FLASK enabled,
         * reduces the size of struct evtchn.
         */
        u32 flask_sid;
#endif
    } ssid;
#endif
} __attribute__((aligned(64)));

int  evtchn_init(struct domain *d, unsigned int max_port);
int  evtchn_destroy(struct domain *d); /* from domain_kill */
void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */

struct waitqueue_vcpu;

struct vcpu
{
    int vcpu_id;

    int processor;

    vcpu_info_t *vcpu_info;

    struct domain *domain;

    struct vcpu *next_in_list;

    spinlock_t periodic_timer_lock;
    s_time_t periodic_period;
    s_time_t periodic_last_event;
    struct timer periodic_timer;
    struct timer singleshot_timer;

    struct timer poll_timer;    /* timeout for SCHEDOP_poll */

    struct sched_unit *sched_unit;

    struct vcpu_runstate_info runstate;
#ifndef CONFIG_COMPAT
# define runstate_guest(v) ((v)->runstate_guest)
    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
#else
# define runstate_guest(v) ((v)->runstate_guest.native)
    union {
        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
    } runstate_guest; /* guest address */
#endif
    unsigned int new_state;

    /* Has the FPU been initialised? */
    bool fpu_initialised;
    /* Has the FPU been used since it was last saved? */
    bool fpu_dirtied;
    /* Initialization completed for this VCPU? */
    bool is_initialised;
    /* Currently running on a CPU? */
    bool is_running;
    /* VCPU should wake fast (do not deep sleep the CPU). */
    bool is_urgent;
    /* VCPU must context_switch without scheduling unit. */
    bool force_context_switch;
    /* Require shutdown to be deferred for some asynchronous operation? */
    bool defer_shutdown;
    /* VCPU is paused following shutdown request (d->is_shutting_down)? */
    bool paused_for_shutdown;
    /* VCPU needs affinity restored. */
    uint8_t affinity_broken;
#define VCPU_AFFINITY_OVERRIDE    0x01
#define VCPU_AFFINITY_WAIT        0x02

    /* A hypercall has been preempted. */
    bool hcall_preempted;
#ifdef CONFIG_COMPAT
    /* A hypercall is using the compat ABI? */
    bool hcall_compat;
#endif

    /* The CPU, if any, which is holding onto this VCPU's state. */
#define VCPU_CPU_CLEAN (~0u)
    unsigned int dirty_cpu;

    /*
     * > 0: a single port is being polled;
     * = 0: nothing is being polled (vcpu should be clear in d->poll_mask);
     * < 0: multiple ports may be being polled.
     */
    int poll_evtchn;

    /* (over-)protected by ->domain->event_lock */
    int pirq_evtchn_head;

    unsigned long pause_flags;
    atomic_t pause_count;

    /* VCPU paused for vm_event replies. */
    atomic_t vm_event_pause_count;
    /* VCPU paused by system controller. */
    int controller_pause_count;

    /* Grant table map tracking. */
    spinlock_t maptrack_freelist_lock;
    unsigned int maptrack_head;
    unsigned int maptrack_tail;

    /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
    evtchn_port_t virq_to_evtchn[NR_VIRQS];
    spinlock_t virq_lock;

    /* Tasklet for continue_hypercall_on_cpu(). */
    struct tasklet continue_hypercall_tasklet;

    /* Multicall information. */
    struct mc_state mc_state;

    struct waitqueue_vcpu *waitqueue_vcpu;

    /* Guest-specified relocation of vcpu_info. */
    mfn_t vcpu_info_mfn;

    struct evtchn_fifo_vcpu *evtchn_fifo;

    /* vPCI per-vCPU area, used to store data for long running operations. */
    struct vpci_vcpu vpci;

    struct arch_vcpu arch;
};

struct sched_unit {
    struct domain *domain;
    struct vcpu *vcpu_list;
    void *priv;                 /* scheduler private data */
    struct sched_unit *next_in_list;
    struct sched_resource *res;
    unsigned int unit_id;

    /* Currently running on a CPU? */
    bool is_running;
    /* Does soft affinity actually play a role (given hard affinity)? */
    bool soft_aff_effective;
    /* Item has been migrated to other cpu(s). */
    bool migrated;

    /* Last time unit got (de-)scheduled. */
    uint64_t state_entry_time;
    /* Vcpu state summary. */
    unsigned int runstate_cnt[4];

    /* Bitmask of CPUs on which this VCPU may run. */
    cpumask_var_t cpu_hard_affinity;
    /* Used to save affinity during temporary pinning. */
    cpumask_var_t cpu_hard_affinity_saved;
    /* Bitmask of CPUs on which this VCPU prefers to run. */
    cpumask_var_t cpu_soft_affinity;

    /* Next unit to run. */
    struct sched_unit *next_task;
    s_time_t next_time;

    /* Number of vcpus not yet joined for context switch. */
    unsigned int rendezvous_in_cnt;

    /* Number of vcpus not yet finished with context switch. */
    atomic_t rendezvous_out_cnt;
};

#define for_each_sched_unit(d, u)                                         \
    for ( (u) = (d)->sched_unit_list; (u) != NULL; (u) = (u)->next_in_list )

/*
 * All vcpus of a domain are in a single linked list, with unit->vcpu_list
 * pointing to the first vcpu of the unit. The loop must terminate when it
 * reaches a vcpu that is not part of the unit being iterated over.
 */
#define for_each_sched_unit_vcpu(u, v)                                    \
    for ( (v) = (u)->vcpu_list;                                           \
          (v) != NULL && (!(u)->next_in_list ||                           \
                          (v)->vcpu_id < (u)->next_in_list->unit_id);     \
          (v) = (v)->next_in_list )
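/*
 * Worked example (hypothetical layout, for illustration only): with two vcpus
 * per unit, units get ids 0, 2, 4, ... and unit 0 owns vcpus 0 and 1.
 * Iterating over unit 0 therefore visits vcpu 0 and vcpu 1, and stops on
 * reaching vcpu 2, whose id equals the next unit's unit_id. For the last unit
 * there is no next_in_list, so the loop simply runs to the end of the
 * domain's vcpu list.
 */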

/* Per-domain lock can be recursively acquired in fault handlers. */
#define domain_lock(d) spin_lock_recursive(&(d)->domain_lock)
#define domain_unlock(d) spin_unlock_recursive(&(d)->domain_lock)

struct evtchn_port_ops;

struct domain
{
    domid_t domain_id;

    unsigned int max_vcpus;
    struct vcpu **vcpu;

    shared_info_t *shared_info;     /* shared data area */

    rcu_read_lock_t rcu_lock;

    spinlock_t domain_lock;

    spinlock_t page_alloc_lock;     /* protects all the following fields */
    struct page_list_head page_list;        /* linked list */
    struct page_list_head extra_page_list;  /* linked list (size extra_pages) */
    struct page_list_head xenpage_list;     /* linked list (size xenheap_pages) */

    /*
     * This field should only be directly accessed by domain_adjust_tot_pages()
     * and the domain_tot_pages() helper function defined below.
     */
    unsigned int tot_pages;

    unsigned int xenheap_pages;     /* pages allocated from Xen heap */
    unsigned int outstanding_pages; /* pages claimed but not possessed */
    unsigned int max_pages;         /* maximum value for domain_tot_pages() */
    unsigned int extra_pages;       /* pages not included in domain_tot_pages() */
    atomic_t shr_pages;             /* shared pages */
    atomic_t paged_pages;           /* paged-out pages */

    /* Scheduling. */
    void *sched_priv;               /* scheduler-specific data */
    struct sched_unit *sched_unit_list;
    struct cpupool *cpupool;

    struct domain *next_in_list;
    struct domain *next_in_hashbucket;

    struct list_head rangesets;
    spinlock_t rangesets_lock;

    /* Event channel information. */
    struct evtchn *evtchn;                          /* first bucket only */
    struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
    unsigned int max_evtchn_port;   /* max permitted port number */
    unsigned int valid_evtchns;     /* number of allocated event channels */
    /*
     * Number of in-use event channels.  Writers should use write_atomic().
     * Readers need to use read_atomic() only when not holding event_lock.
     */
    unsigned int active_evtchns;
    /*
     * Number of event channels used internally by Xen (not subject to
     * EVTCHNOP_reset).  Read/write access like for active_evtchns.
     */
    unsigned int xen_evtchns;
    /* Port to resume from in evtchn_reset(), when in a continuation. */
    unsigned int next_evtchn;
    spinlock_t event_lock;
    const struct evtchn_port_ops *evtchn_port_ops;
    struct evtchn_fifo_domain *evtchn_fifo;

    struct grant_table *grant_table;

    /*
     * Interrupt to event-channel mappings and other per-guest-pirq data.
     * Protected by the domain's event-channel spinlock.
     */
    struct radix_tree_root pirq_tree;
    unsigned int nr_pirqs;

    unsigned int options;           /* copy of createdomain flags */

    /* Is this guest dying (i.e., a zombie)? */
    enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;

    /* Domain is paused by controller software? */
    int controller_pause_count;

    struct {
        int64_t seconds;
        bool set;
    } time_offset;

#ifdef CONFIG_HAS_PCI
    struct list_head pdev_list;
#endif

#ifdef CONFIG_HAS_PASSTHROUGH
    struct domain_iommu iommu;
#endif
    /* Is node-affinity automatically computed? */
    bool auto_node_affinity;
    /* Is this guest fully privileged (aka dom0)? */
    bool is_privileged;
    /* Can this guest access the Xen console? */
    bool is_console;
    /* Non-migratable and non-restorable? */
    bool disable_migrate;
    /* Is this guest being debugged by dom0? */
    bool debugger_attached;
    /*
     * Set to true at the very end of domain creation, when the domain is
     * unpaused for the first time by the systemcontroller.
     */
    bool creation_finished;

    /* Which guest this guest has privileges on */
    struct domain *target;

    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
#if MAX_VIRT_CPUS <= BITS_PER_LONG
    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
#else
    unsigned long *poll_mask;
#endif

    /* I/O capabilities (access to IRQs and memory-mapped I/O). */
    struct rangeset *iomem_caps;
    struct rangeset *irq_caps;

    /* Guest has shut down (inc. reason code)? */
    spinlock_t shutdown_lock;
    bool is_shutting_down;          /* in process of shutting down? */
    bool is_shut_down;              /* fully shut down? */
#define SHUTDOWN_CODE_INVALID ~0u
    unsigned int shutdown_code;

    /*
     * If this is not 0, send suspend notification here instead of
     * raising DOM_EXC.
     */
    evtchn_port_t suspend_evtchn;

    atomic_t pause_count;
    atomic_t refcnt;

    unsigned long vm_assist;

    /* Bitmask of CPUs which are holding onto this domain's state. */
    cpumask_var_t dirty_cpumask;

    struct arch_domain arch;

    void *ssid; /* sHype security subject identifier */

    /* Control-plane tools handle for this domain. */
    xen_domain_handle_t handle;

    /* hvm_print_line() and guest_console_write() logging. */
#define DOMAIN_PBUF_SIZE 200
    char *pbuf;
    unsigned pbuf_idx;
    spinlock_t pbuf_lock;

    /* OProfile support. */
    struct xenoprof *xenoprof;

    /* Domain watchdog. */
#define NR_DOMAIN_WATCHDOG_TIMERS 2
    spinlock_t watchdog_lock;
    uint32_t watchdog_inuse_map;
    struct timer watchdog_timer[NR_DOMAIN_WATCHDOG_TIMERS];

    struct rcu_head rcu;

    /*
     * Hypercall deadlock avoidance lock. Used if a hypercall might
     * cause a deadlock. Acquirers don't spin waiting; they preempt.
     */
    spinlock_t hypercall_deadlock_mutex;

    struct lock_profile_qhead profile_head;

    /* Various vm_events */

    /* Memory sharing support */
#ifdef CONFIG_MEM_SHARING
    struct vm_event_domain *vm_event_share;
    struct domain *parent; /* VM fork parent */
#endif
    /* Memory paging support */
#ifdef CONFIG_HAS_MEM_PAGING
    struct vm_event_domain *vm_event_paging;
#endif
    /* VM event monitor support */
    struct vm_event_domain *vm_event_monitor;

    /*
     * Can be specified by the user. If that is not the case, it is
     * computed from the union of all the vcpu cpu-affinity masks.
     */
    nodemask_t node_affinity;
    unsigned int last_alloc_node;
    spinlock_t node_affinity_lock;

    /* vNUMA topology accesses are protected by rwlock. */
    rwlock_t vnuma_rwlock;
    struct vnuma_info *vnuma;

    /* Common monitor options */
    struct {
        unsigned int guest_request_enabled : 1;
        unsigned int guest_request_sync    : 1;
    } monitor;

#ifdef CONFIG_ARGO
    /* Argo interdomain communication support */
    struct argo_domain *argo;
#endif
};

static inline struct page_list_head *page_to_list(
    struct domain *d, const struct page_info *pg)
{
    if ( is_xen_heap_page(pg) )
        return &d->xenpage_list;

    if ( pg->count_info & PGC_extra )
        return &d->extra_page_list;

    return &d->page_list;
}

/* Return number of pages currently possessed by the domain. */
static inline unsigned int domain_tot_pages(const struct domain *d)
{
    ASSERT(d->extra_pages <= d->tot_pages);

    return d->tot_pages - d->extra_pages;
}

/* Protect updates/reads (resp.) of domain_list and domain_hash. */
extern spinlock_t domlist_update_lock;
extern rcu_read_lock_t domlist_read_lock;

extern struct vcpu *idle_vcpu[NR_CPUS];
#define is_idle_domain(d) ((d)->domain_id == DOMID_IDLE)
#define is_idle_vcpu(v)   (is_idle_domain((v)->domain))

static inline bool is_system_domain(const struct domain *d)
{
    return d->domain_id >= DOMID_FIRST_RESERVED;
}

#define DOMAIN_DESTROYED (1u << 31) /* assumes atomic_t is >= 32 bits */
#define put_domain(_d) \
  if ( atomic_dec_and_test(&(_d)->refcnt) ) domain_destroy(_d)

/*
 * Use this when you don't have an existing reference to @d. It returns
 * FALSE if @d is being destroyed.
 */
static always_inline bool get_domain(struct domain *d)
{
    int old, seen = atomic_read(&d->refcnt);
    do
    {
        old = seen;
        if ( unlikely(old & DOMAIN_DESTROYED) )
            return false;
        seen = atomic_cmpxchg(&d->refcnt, old, old + 1);
    }
    while ( unlikely(seen != old) );
    return true;
}
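/*
 * Sketch of the usual reference-counting pattern (illustrative only; the
 * caller shown is hypothetical):
 *
 *     struct domain *d = get_domain_by_id(domid);   // takes a reference
 *     if ( d )
 *     {
 *         // ... operate on d; it cannot be fully destroyed meanwhile ...
 *         put_domain(d);                            // drop the reference
 *     }
 *
 * get_domain() itself is the building block used when a domain pointer is
 * already at hand (e.g. found under RCU) and a proper reference is needed.
 */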

/*
 * Use this when you already have, or are borrowing, a reference to @d.
 * In this case we know that @d cannot be destroyed under our feet.
 */
static inline void get_knownalive_domain(struct domain *d)
{
    atomic_inc(&d->refcnt);
    ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
}

int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
void domain_update_node_affinity(struct domain *d);

/*
 * To be implemented by each architecture, sanity checking the configuration
 * and filling in any appropriate defaults.
 */
int arch_sanitise_domain_config(struct xen_domctl_createdomain *config);

/*
 * Create a domain: the configuration is only necessary for real domains
 * (domid < DOMID_FIRST_RESERVED).
 */
struct domain *domain_create(domid_t domid,
                             struct xen_domctl_createdomain *config,
                             bool is_priv);

/*
 * rcu_lock_domain_by_id() is more efficient than get_domain_by_id().
 * This is the preferred function if the returned domain reference
 * is short lived, but it cannot be used if the domain reference needs
 * to be kept beyond the current scope (e.g., across a softirq).
 * The returned domain reference must be discarded using rcu_unlock_domain().
 */
struct domain *rcu_lock_domain_by_id(domid_t dom);

/*
 * As above function, but resolves DOMID_SELF to current domain
 */
struct domain *rcu_lock_domain_by_any_id(domid_t dom);

/*
 * As rcu_lock_domain_by_id(), but will fail EPERM or ESRCH rather than resolve
 * to local domain.
 */
int rcu_lock_remote_domain_by_id(domid_t dom, struct domain **d);

/*
 * As rcu_lock_remote_domain_by_id() but will fail EINVAL if the domain is
 * dying.
 */
int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d);

static inline void rcu_unlock_domain(struct domain *d)
{
    if ( d != current->domain )
        rcu_read_unlock(&d->rcu_lock);
}

static inline struct domain *rcu_lock_domain(struct domain *d)
{
    if ( d != current->domain )
        rcu_read_lock(&d->rcu_lock);
    return d;
}

static inline struct domain *rcu_lock_current_domain(void)
{
    return /*rcu_lock_domain*/(current->domain);
}

struct domain *get_domain_by_id(domid_t dom);

struct domain *get_pg_owner(domid_t domid);

static inline void put_pg_owner(struct domain *pg_owner)
{
    rcu_unlock_domain(pg_owner);
}

void domain_destroy(struct domain *d);
int domain_kill(struct domain *d);
int domain_shutdown(struct domain *d, u8 reason);
void domain_resume(struct domain *d);

int domain_soft_reset(struct domain *d, bool resuming);

int vcpu_start_shutdown_deferral(struct vcpu *v);
void vcpu_end_shutdown_deferral(struct vcpu *v);

/*
 * Mark specified domain as crashed. This function always returns, even if the
 * caller is the specified domain. The domain is not synchronously descheduled
 * from any processor.
 */
void __domain_crash(struct domain *d);
#define domain_crash(d) do {                                              \
    printk("domain_crash called from %s:%d\n", __FILE__, __LINE__);       \
    __domain_crash(d);                                                    \
} while (0)

/*
 * Called from assembly code, with an optional address to help indicate why
 * the crash occurred. If addr is 0, look up address from last extable
 * redirection.
 */
void noreturn asm_domain_crash_synchronous(unsigned long addr);

void scheduler_init(void);
int  sched_init_vcpu(struct vcpu *v);
void sched_destroy_vcpu(struct vcpu *v);
int  sched_init_domain(struct domain *d, int poolid);
void sched_destroy_domain(struct domain *d);
long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
long sched_adjust_global(struct xen_sysctl_scheduler_op *);
int  sched_id(void);
void vcpu_wake(struct vcpu *v);
long vcpu_yield(void);
void vcpu_sleep_nosync(struct vcpu *v);
void vcpu_sleep_sync(struct vcpu *v);

/*
 * Force synchronisation of given VCPU's state. If it is currently descheduled,
 * this call will ensure that all its state is committed to memory and that
 * no CPU is using critical state (e.g., page tables) belonging to the VCPU.
 */
void sync_vcpu_execstate(struct vcpu *v);

/* As above, for any lazy state being held on the local CPU. */
void sync_local_execstate(void);

/*
 * Called by the scheduler to switch to another VCPU. This function must
 * call sched_context_switched(@prev, @next) when the local CPU is no longer
 * running in @prev's context, and that context is saved to memory.
 * Alternatively, if implementing lazy context switching, it suffices to ensure
 * that invoking sync_vcpu_execstate() will switch and commit @prev's state.
 */
void context_switch(
    struct vcpu *prev,
    struct vcpu *next);

/*
 * As described above, context_switch() must call this function when the
 * local CPU is no longer running in @prev's context, and @prev's context is
 * saved to memory. Alternatively, if implementing lazy context switching,
 * ensure that invoking sync_vcpu_execstate() will switch and commit @prev.
 */
void sched_context_switched(struct vcpu *prev, struct vcpu *vnext);

/* Called by the scheduler to continue running the current VCPU. */
void continue_running(
    struct vcpu *same);

void startup_cpu_idle_loop(void);
extern void (*pm_idle) (void);
extern void (*dead_idle) (void);


/*
 * Creates a continuation to resume the current hypercall. The caller should
 * return immediately, propagating the value returned from this invocation.
 * The format string specifies the types and number of hypercall arguments.
 * It contains one character per argument as follows:
 *  'i' [unsigned] {char, int}
 *  'l' [unsigned] long
 *  'h' guest handle (XEN_GUEST_HANDLE(foo))
 */
unsigned long hypercall_create_continuation(
    unsigned int op, const char *format, ...);

static inline void hypercall_cancel_continuation(struct vcpu *v)
{
    v->hcall_preempted = false;
}

/*
 * For long-running operations that must be in hypercall context, check
 * if there is background work to be done that should interrupt this
 * operation.
 */
#define hypercall_preempt_check() (unlikely(    \
        softirq_pending(smp_processor_id()) |   \
        local_events_need_delivery()            \
    ))
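/*
 * Typical (illustrative) use of the two helpers above in a long-running
 * hypercall handler; the op name and arguments are hypothetical:
 *
 *     for ( ; i < count; i++ )
 *     {
 *         if ( hypercall_preempt_check() )
 *             return hypercall_create_continuation(
 *                 __HYPERVISOR_some_op, "ih", i, arg_handle);
 *         // ... process element i ...
 *     }
 *
 * The continuation re-enters the same hypercall with the updated 'i', so the
 * work resumes where it left off once pending softirqs/events are handled.
 */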

/*
 * For long-running operations that may be in hypercall context or on
 * the idle vcpu (e.g. during dom0 construction), check if there is
 * background work to be done that should interrupt this operation.
 */
#define general_preempt_check() (unlikely(                          \
        softirq_pending(smp_processor_id()) ||                      \
        (!is_idle_vcpu(current) && local_events_need_delivery())    \
    ))

extern struct domain *domain_list;

/* Caller must hold the domlist_read_lock or domlist_update_lock. */
static inline struct domain *first_domain_in_cpupool(const struct cpupool *c)
{
    struct domain *d;
    for (d = rcu_dereference(domain_list); d && d->cpupool != c;
         d = rcu_dereference(d->next_in_list));
    return d;
}
static inline struct domain *next_domain_in_cpupool(
    struct domain *d, const struct cpupool *c)
{
    for (d = rcu_dereference(d->next_in_list); d && d->cpupool != c;
         d = rcu_dereference(d->next_in_list));
    return d;
}

#define for_each_domain(_d)                          \
    for ( (_d) = rcu_dereference(domain_list);       \
          (_d) != NULL;                              \
          (_d) = rcu_dereference((_d)->next_in_list) )

#define for_each_domain_in_cpupool(_d, _c)           \
    for ( (_d) = first_domain_in_cpupool(_c);        \
          (_d) != NULL;                              \
          (_d) = next_domain_in_cpupool((_d), (_c)) )

#define for_each_vcpu(_d, _v)                        \
    for ( (_v) = (_d)->vcpu ? (_d)->vcpu[0] : NULL;  \
          (_v) != NULL;                              \
          (_v) = (_v)->next_in_list )
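/*
 * Illustrative (hypothetical) walk over all domains and their vcpus. The
 * domain list is RCU-protected, so a reader is expected to hold the RCU
 * read lock (or domlist_update_lock) around the traversal:
 *
 *     rcu_read_lock(&domlist_read_lock);
 *     for_each_domain ( d )
 *         for_each_vcpu ( d, v )
 *             ;  // ... inspect v ...
 *     rcu_read_unlock(&domlist_read_lock);
 */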

/*
 * Per-VCPU pause flags.
 */
/* Domain is blocked waiting for an event. */
#define _VPF_blocked         0
#define VPF_blocked          (1UL<<_VPF_blocked)
/* VCPU is offline. */
#define _VPF_down            1
#define VPF_down             (1UL<<_VPF_down)
/* VCPU is blocked awaiting an event to be consumed by Xen. */
#define _VPF_blocked_in_xen  2
#define VPF_blocked_in_xen   (1UL<<_VPF_blocked_in_xen)
/* VCPU affinity has changed: migrating to a new CPU. */
#define _VPF_migrating       3
#define VPF_migrating        (1UL<<_VPF_migrating)
/* VCPU is blocked due to missing mem_paging ring. */
#define _VPF_mem_paging      4
#define VPF_mem_paging       (1UL<<_VPF_mem_paging)
/* VCPU is blocked due to missing mem_access ring. */
#define _VPF_mem_access      5
#define VPF_mem_access       (1UL<<_VPF_mem_access)
/* VCPU is blocked due to missing mem_sharing ring. */
#define _VPF_mem_sharing     6
#define VPF_mem_sharing      (1UL<<_VPF_mem_sharing)
/* VCPU is being reset. */
#define _VPF_in_reset        7
#define VPF_in_reset         (1UL<<_VPF_in_reset)
/* VCPU is parked. */
#define _VPF_parked          8
#define VPF_parked           (1UL<<_VPF_parked)

static inline bool vcpu_runnable(const struct vcpu *v)
{
    return !(v->pause_flags |
             atomic_read(&v->pause_count) |
             atomic_read(&v->domain->pause_count));
}

static inline bool is_vcpu_dirty_cpu(unsigned int cpu)
{
    BUILD_BUG_ON(NR_CPUS >= VCPU_CPU_CLEAN);
    return cpu != VCPU_CPU_CLEAN;
}

static inline bool vcpu_cpu_dirty(const struct vcpu *v)
{
    return is_vcpu_dirty_cpu(read_atomic(&v->dirty_cpu));
}

void vcpu_block(void);
void vcpu_unblock(struct vcpu *v);
void vcpu_pause(struct vcpu *v);
void vcpu_pause_nosync(struct vcpu *v);
void vcpu_unpause(struct vcpu *v);
int vcpu_pause_by_systemcontroller(struct vcpu *v);
int vcpu_unpause_by_systemcontroller(struct vcpu *v);

void domain_pause(struct domain *d);
void domain_pause_nosync(struct domain *d);
void domain_unpause(struct domain *d);
int domain_unpause_by_systemcontroller(struct domain *d);
int __domain_pause_by_systemcontroller(struct domain *d,
                                       void (*pause_fn)(struct domain *d));
static inline int domain_pause_by_systemcontroller(struct domain *d)
{
    return __domain_pause_by_systemcontroller(d, domain_pause);
}
static inline int domain_pause_by_systemcontroller_nosync(struct domain *d)
{
    return __domain_pause_by_systemcontroller(d, domain_pause_nosync);
}

/* domain_pause() but safe against trying to pause current. */
int __must_check domain_pause_except_self(struct domain *d);
void domain_unpause_except_self(struct domain *d);

/*
 * For each allocated vcpu, d->vcpu[X]->vcpu_id == X
 *
 * During construction, all vcpus in d->vcpu[] are allocated sequentially, and
 * in ascending order.  Therefore, if d->vcpu[N] exists (e.g. derived from
 * current), all vcpus with an id less than N also exist.
 *
 * SMP considerations: The idle domain is constructed before APs are started.
 * All other domains have d->vcpu[] allocated and d->max_vcpus set before the
 * domain is made visible in the domlist, which is serialised on the global
 * domlist_update_lock.
 *
 * Therefore, all observations of d->max_vcpus vs d->vcpu[] will be consistent
 * despite the lack of smp_* barriers, either by being on the same CPU as the
 * one which issued the writes, or because of barrier properties of the domain
 * having been inserted into the domlist.
 */
static inline struct vcpu *domain_vcpu(const struct domain *d,
                                       unsigned int vcpu_id)
{
    unsigned int idx = array_index_nospec(vcpu_id, d->max_vcpus);

    return vcpu_id >= d->max_vcpus ? NULL : d->vcpu[idx];
}

void cpu_init(void);

/*
 * A vcpu is urgent if it is polling an event channel.
 *
 * If an urgent vcpu exists, the CPU should not enter a deep C-state.
 */
DECLARE_PER_CPU(atomic_t, sched_urgent_count);
static inline bool sched_has_urgent_vcpu(void)
{
    return atomic_read(&this_cpu(sched_urgent_count));
}

void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value);
void sched_setup_dom0_vcpus(struct domain *d);
int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason);
int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity);
void restore_vcpu_affinity(struct domain *d);
int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
                         struct xen_domctl_vcpuaffinity *vcpuaff);

void vcpu_runstate_get(const struct vcpu *v,
                       struct vcpu_runstate_info *runstate);
uint64_t get_cpu_idle_time(unsigned int cpu);
void sched_guest_idle(void (*idle) (void), unsigned int cpu);
void scheduler_enable(void);
void scheduler_disable(void);

/*
 * Used by idle loop to decide whether there is work to do:
 *  (1) Deal with RCU; (2) or run softirqs; or (3) Play dead;
 *  or (4) Run tasklets.
 *
 * About (4), if a tasklet is enqueued, it will be scheduled
 * really really soon, and hence it's pointless to try to
 * sleep between these two events (that's why we don't call
 * the tasklet_work_to_do() helper).
 */
#define cpu_is_haltable(cpu)                    \
    (!rcu_needs_cpu(cpu) &&                     \
     !softirq_pending(cpu) &&                   \
     cpu_online(cpu) &&                         \
     !per_cpu(tasklet_work_to_do, cpu))

void watchdog_domain_init(struct domain *d);
void watchdog_domain_destroy(struct domain *d);

/*
 * Use this check when the following are both true:
 *  - Using this feature or interface requires full access to the hardware
 *    (that is, this would not be suitable for a driver domain)
 *  - There is never a reason to deny the hardware domain access to this
 */
static always_inline bool is_hardware_domain(const struct domain *d)
{
    if ( IS_ENABLED(CONFIG_PV_SHIM_EXCLUSIVE) )
        return false;

    return evaluate_nospec(d == hardware_domain);
}

/* This check is for functionality specific to a control domain */
static always_inline bool is_control_domain(const struct domain *d)
{
    if ( IS_ENABLED(CONFIG_PV_SHIM_EXCLUSIVE) )
        return false;

    return evaluate_nospec(d->is_privileged);
}

#define VM_ASSIST(d, t) (test_bit(VMASST_TYPE_ ## t, &(d)->vm_assist))

static always_inline bool is_pv_domain(const struct domain *d)
{
    return IS_ENABLED(CONFIG_PV) &&
        evaluate_nospec(!(d->options & XEN_DOMCTL_CDF_hvm));
}

static always_inline bool is_pv_vcpu(const struct vcpu *v)
{
    return is_pv_domain(v->domain);
}

#ifdef CONFIG_COMPAT
static always_inline bool is_pv_32bit_domain(const struct domain *d)
{
#ifdef CONFIG_PV32
    return is_pv_domain(d) && d->arch.pv.is_32bit;
#else
    return false;
#endif
}

static always_inline bool is_pv_32bit_vcpu(const struct vcpu *v)
{
    return is_pv_32bit_domain(v->domain);
}

static always_inline bool is_pv_64bit_domain(const struct domain *d)
{
    if ( !is_pv_domain(d) )
        return false;

#ifdef CONFIG_PV32
    return !d->arch.pv.is_32bit;
#else
    return true;
#endif
}

static always_inline bool is_pv_64bit_vcpu(const struct vcpu *v)
{
    return is_pv_64bit_domain(v->domain);
}
#endif
static always_inline bool is_hvm_domain(const struct domain *d)
{
    return IS_ENABLED(CONFIG_HVM) &&
        evaluate_nospec(d->options & XEN_DOMCTL_CDF_hvm);
}

static always_inline bool is_hvm_vcpu(const struct vcpu *v)
{
    return is_hvm_domain(v->domain);
}

static always_inline bool hap_enabled(const struct domain *d)
{
    /* sanitise_domain_config() rejects HAP && !HVM */
    return IS_ENABLED(CONFIG_HVM) &&
        evaluate_nospec(d->options & XEN_DOMCTL_CDF_hap);
}

static inline bool is_hwdom_pinned_vcpu(const struct vcpu *v)
{
    return (is_hardware_domain(v->domain) &&
            cpumask_weight(v->sched_unit->cpu_hard_affinity) == 1);
}

static inline bool is_vcpu_online(const struct vcpu *v)
{
    return !test_bit(_VPF_down, &v->pause_flags);
}

static inline bool is_xenstore_domain(const struct domain *d)
{
    return d->options & XEN_DOMCTL_CDF_xs_domain;
}

static always_inline bool is_iommu_enabled(const struct domain *d)
{
    return evaluate_nospec(d->options & XEN_DOMCTL_CDF_iommu);
}

extern bool sched_smt_power_savings;
extern bool sched_disable_smt_switching;

extern enum cpufreq_controller {
    FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
} cpufreq_controller;

static always_inline bool is_cpufreq_controller(const struct domain *d)
{
    /*
     * A PV dom0 can be nominated as the cpufreq controller, instead of using
     * Xen's cpufreq driver, at which point dom0 gets direct access to certain
     * MSRs.
     *
     * This interface only works when dom0 is identity pinned and has the same
     * number of vCPUs as pCPUs on the system.
     *
     * It would be far better to paravirtualise the interface.
     */
    return (is_pv_domain(d) && is_hardware_domain(d) &&
            cpufreq_controller == FREQCTL_dom0_kernel);
}

int cpupool_move_domain(struct domain *d, struct cpupool *c);
int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op);
int cpupool_get_id(const struct domain *d);
const cpumask_t *cpupool_valid_cpus(const struct cpupool *pool);
extern void dump_runq(unsigned char key);

void arch_do_physinfo(struct xen_sysctl_physinfo *pi);

#endif /* __SCHED_H__ */

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */