/*****************************************************************************
 * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
 * EDF scheduling is a real-time scheduling algorithm used in embedded systems.
 *
 * by Sisu Xi, 2013, Washington University in Saint Louis
 * Meng Xu, 2014-2016, University of Pennsylvania
 *
 * Conversion toward event driven model by Tianyang Chen
 * and Dagaen Golomb, 2016, University of Pennsylvania
 *
 * based on the code of the Credit scheduler
 */
13 
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/timer.h>
22 #include <xen/perfc.h>
23 #include <xen/softirq.h>
24 #include <asm/atomic.h>
25 #include <xen/errno.h>
26 #include <xen/trace.h>
27 #include <xen/cpu.h>
28 #include <xen/keyhandler.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32 
33 #include "private.h"
34 
/*
 * TODO:
 *
 * Migration compensation and resistance, like credit2, to better use caches;
 * Lock Holder Problem, using yield?
 * Self switch problem: UNITs of the same domain may preempt each other;
 */
42 
/*
 * Design:
 *
 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
 * theory from the real-time field.
 * At any scheduling point, the UNIT with the earlier deadline has higher
 * priority. The scheduler always picks the highest-priority UNIT to run on a
 * feasible PCPU.
 * A PCPU is feasible if the UNIT can run on it and it is either idle or
 * running a lower-priority UNIT.
 *
 * Each UNIT has a dedicated period, budget and an extratime flag.
 * The deadline of a UNIT is at the end of each period;
 * A UNIT has its budget replenished at the beginning of each period;
 * While scheduled, a UNIT burns its budget.
 * A UNIT needs to finish its budget before its deadline in each period;
 * A UNIT discards its unused budget at the end of each period.
 * When a UNIT runs out of budget in a period, if its extratime flag is set,
 * the UNIT increases its priority_level by 1 and refills its budget;
 * otherwise, it has to wait until the next period.
 *
 * Each UNIT is implemented as a deferrable server.
 * When a UNIT has a task running on it, its budget is continuously burned;
 * When a UNIT has no task but budget left, its budget is preserved.
 *
 * Queue scheme:
 * A global runqueue and a global depletedqueue for each CPU pool.
 * The runqueue holds all runnable UNITs with budget,
 * sorted by priority_level and deadline;
 * The depletedqueue holds all UNITs without budget, unsorted.
 *
 * Note: cpumask and cpupool are supported.
 */
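/*
 * Illustrative example (not part of the original comments): a minimal sketch
 * of how the EDF ordering above plays out, assuming two UNITs that both
 * become runnable at t = 0 and compete for a single feasible PCPU:
 *
 *   UNIT A: period = 10ms, budget = 4ms  ->  first deadline at t = 10ms
 *   UNIT B: period =  6ms, budget = 2ms  ->  first deadline at t =  6ms
 *
 * Both start at priority_level 0, so the deadlines decide: B runs first.
 * Once B has burned its 2ms of budget it moves to the depletedqueue (or, if
 * its extratime flag is set, to priority_level 1 with a refilled budget),
 * and A runs. At t = 6ms B is replenished with deadline t = 12ms, which is
 * later than A's t = 10ms, so A keeps the PCPU until it depletes or blocks.
 */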
75 
/*
 * Locking:
 * A global system lock is used to protect the RunQ and DepletedQ.
 * The global lock is referenced by sched_res->schedule_lock
 * from all physical cpus.
 *
 * The lock is already grabbed when calling wake/sleep/schedule functions
 * in schedule.c
 *
 * The functions that involve the RunQ and need to grab the lock are:
 *    unit_insert, unit_remove, context_saved, runq_insert
 */
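/*
 * A minimal sketch of the locking pattern used throughout this file
 * (prv->lock is what sched_res->schedule_lock points at):
 *
 *     struct rt_private *prv = rt_priv(ops);
 *     unsigned long flags;
 *
 *     spin_lock_irqsave(&prv->lock, flags);
 *     ... manipulate the RunQ / DepletedQ / replenishment queue ...
 *     spin_unlock_irqrestore(&prv->lock, flags);
 */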
88 
89 
/*
 * Default parameters:
 * The default period and budget are 10 ms and 4 ms, respectively.
 */
94 #define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
95 #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
96 
/*
 * Max period: max delta of the time type, because the period is added to the
 * time a unit activates, so this must not overflow.
 * Min period: 10 us, considering the scheduling overhead (when the period is
 * too low, scheduling is invoked too frequently, causing high overhead).
 */
103 #define RTDS_MAX_PERIOD     (STIME_DELTA_MAX)
104 #define RTDS_MIN_PERIOD     (MICROSECS(10))
105 
106 /*
107  * Min budget: 10 us, considering the scheduling overhead (when budget is
108  * consumed too fast, scheduling is invoked too frequently, causing
109  * high overhead).
110  */
111 #define RTDS_MIN_BUDGET     (MICROSECS(10))
112 
/*
 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
 * the next deadline, repeatedly adding the period is faster if the difference
 * between cur_deadline and now is small. If the difference is bigger than
 * 1024 * period, use multiplication instead.
 */
119 #define UPDATE_LIMIT_SHIFT      10
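/*
 * Worked example (illustrative): with the default 10ms period,
 * 1024 * period is roughly 10.24s. If a unit wakes up 50ms past its
 * deadline, rt_update_deadline() just adds the period a few times; if it
 * was asleep for a minute, the difference exceeds the 1024-period threshold
 * and the new deadline is computed directly, as done below:
 *
 *     count         = (now - cur_deadline) / period + 1;
 *     cur_deadline += count * period;
 */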
120 
121 /*
122  * Flags
123  */
124 /*
125  * RTDS_scheduled: Is this unit either running on, or context-switching off,
126  * a physical cpu?
127  * + Accessed only with global lock held.
128  * + Set when chosen as next in rt_schedule().
129  * + Cleared after context switch has been saved in rt_context_saved()
130  * + Checked in unit_wake to see if we can add to the Runqueue, or if we should
131  *   set RTDS_delayed_runq_add
132  * + Checked to be false in runq_insert.
133  */
134 #define __RTDS_scheduled            1
135 #define RTDS_scheduled (1<<__RTDS_scheduled)
/*
 * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
 * once it's done being context switched out?
 * + Set when scheduling out in rt_schedule() if prev is runnable
 * + Set in rt_unit_wake if it finds RTDS_scheduled set
 * + Read in rt_context_saved(). If set, it adds prev to the RunQ/DepletedQ
 *   and clears the bit.
 */
144 #define __RTDS_delayed_runq_add     2
145 #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
146 
/*
 * RTDS_depleted: Has this unit run out of budget?
 * This flag is
 * + set in burn_budget() if a unit has zero budget left;
 * + cleared and checked in the replenishment handler,
 *   for the units that are being replenished.
 */
154 #define __RTDS_depleted     3
155 #define RTDS_depleted (1<<__RTDS_depleted)
156 
157 /*
158  * RTDS_extratime: Can the unit run in the time that is
159  * not part of any real-time reservation, and would therefore
160  * be otherwise left idle?
161  */
162 #define __RTDS_extratime    4
163 #define RTDS_extratime (1<<__RTDS_extratime)
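/*
 * Flag manipulation sketch (mirrors how the bits above are used below):
 *
 *     __set_bit(__RTDS_extratime, &svc->flags);    <- set the flag
 *     if ( svc->flags & RTDS_extratime ) ...       <- test the flag
 *     __clear_bit(__RTDS_extratime, &svc->flags);  <- clear the flag
 */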
164 
165 /*
166  * rt tracing events ("only" 512 available!). Check
167  * include/public/trace.h for more details.
168  */
169 #define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
170 #define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
171 #define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
172 #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
173 #define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)
174 #define TRC_RTDS_SCHEDULE         TRC_SCHED_CLASS_EVT(RTDS, 6)
175 
176 static void repl_timer_handler(void *data);
177 
/*
 * System-wide private data, including the global RunQueue/DepletedQ.
 * The global lock is referenced by sched_res->schedule_lock from all
 * physical cpus. It can be grabbed via unit_schedule_lock_irq().
 */
183 struct rt_private {
184     spinlock_t lock;            /* the global coarse-grained lock */
    struct list_head sdom;      /* list of available domains, used for dump */
186 
187     struct list_head runq;      /* ordered list of runnable units */
188     struct list_head depletedq; /* unordered list of depleted units */
189 
190     struct timer repl_timer;    /* replenishment timer */
191     struct list_head replq;     /* ordered list of units that need replenishment */
192 
    cpumask_t tickled;          /* cpus that have been tickled */
194 };
195 
196 /*
197  * Virtual CPU
198  */
199 struct rt_unit {
200     struct list_head q_elem;     /* on the runq/depletedq list */
201     struct list_head replq_elem; /* on the replenishment events list */
202 
203     /* UNIT parameters, in nanoseconds */
204     s_time_t period;
205     s_time_t budget;
206 
    /* UNIT current information, in nanoseconds */
208     s_time_t cur_budget;         /* current budget */
209     s_time_t last_start;         /* last start time */
210     s_time_t cur_deadline;       /* current deadline for EDF */
211 
212     /* Up-pointers */
213     struct rt_dom *sdom;
214     struct sched_unit *unit;
215 
216     unsigned priority_level;
217 
218     unsigned flags;              /* mark __RTDS_scheduled, etc.. */
219 };
220 
221 /*
222  * Domain
223  */
224 struct rt_dom {
225     struct list_head sdom_elem; /* link list on rt_priv */
226     struct domain *dom;         /* pointer to upper domain */
227 };
228 
229 /*
230  * Useful inline functions
231  */
static inline struct rt_private *rt_priv(const struct scheduler *ops)
233 {
234     return ops->sched_data;
235 }
236 
static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
238 {
239     return unit->priv;
240 }
241 
static inline struct list_head *rt_runq(const struct scheduler *ops)
243 {
244     return &rt_priv(ops)->runq;
245 }
246 
static inline struct list_head *rt_depletedq(const struct scheduler *ops)
248 {
249     return &rt_priv(ops)->depletedq;
250 }
251 
static inline struct list_head *rt_replq(const struct scheduler *ops)
253 {
254     return &rt_priv(ops)->replq;
255 }
256 
static inline bool has_extratime(const struct rt_unit *svc)
258 {
259     return svc->flags & RTDS_extratime;
260 }
261 
262 /*
263  * Helper functions for manipulating the runqueue, the depleted queue,
264  * and the replenishment events queue.
265  */
266 static int
unit_on_q(const struct rt_unit *svc)
268 {
    return !list_empty(&svc->q_elem);
270 }
271 
272 static struct rt_unit *
q_elem(struct list_head *elem)
274 {
275     return list_entry(elem, struct rt_unit, q_elem);
276 }
277 
278 static struct rt_unit *
replq_elem(struct list_head *elem)
280 {
281     return list_entry(elem, struct rt_unit, replq_elem);
282 }
283 
284 static int
unit_on_replq(const struct rt_unit *svc)
286 {
287     return !list_empty(&svc->replq_elem);
288 }
289 
/*
 * Compare the priorities of two units:
 * return > 0 if v1 has higher priority than v2 (lower priority_level, or
 * equal priority_level and earlier deadline);
 * return 0 if they are equal, and < 0 otherwise.
 */
294 static s_time_t
compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
296 {
297     int prio = v2->priority_level - v1->priority_level;
298 
299     if ( prio == 0 )
300         return v2->cur_deadline - v1->cur_deadline;
301 
302     return prio;
303 }
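/*
 * Example (illustrative): with equal priority_level, comparing a unit whose
 * cur_deadline is at 10ms against one whose cur_deadline is at 15ms yields
 * 15ms - 10ms > 0, i.e. the unit with the earlier deadline wins.
 */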
304 
305 /*
306  * Debug related code, dump unit/cpu information
307  */
308 static void
rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
310 {
311     cpumask_t *cpupool_mask, *mask;
312 
313     ASSERT(svc != NULL);
314     /* idle unit */
    if ( svc->sdom == NULL )
316     {
317         printk("\n");
318         return;
319     }
320 
321     /*
322      * We can't just use 'cpumask_scratch' because the dumping can
323      * happen from a pCPU outside of this scheduler's cpupool, and
324      * hence it's not right to use its pCPU's scratch mask.
325      * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
326      * own scratch space, since we hold the runqueue lock.
327      */
328     mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));
329 
330     cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
331     cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
332     printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
333            " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
334            " \t\t priority_level=%d has_extratime=%d\n"
335            " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
336             svc->unit->domain->domain_id,
337             svc->unit->unit_id,
338             sched_unit_master(svc->unit),
339             svc->period,
340             svc->budget,
341             svc->cur_budget,
342             svc->cur_deadline,
343             svc->last_start,
344             svc->priority_level,
345             has_extratime(svc),
346             unit_on_q(svc),
347             unit_runnable(svc->unit),
348             svc->flags, CPUMASK_PR(mask));
349 }
350 
351 static void
rt_dump_pcpu(const struct scheduler *ops, int cpu)
353 {
354     struct rt_private *prv = rt_priv(ops);
355     const struct rt_unit *svc;
356     unsigned long flags;
357 
358     spin_lock_irqsave(&prv->lock, flags);
359     printk("CPU[%02d]\n", cpu);
360     /* current UNIT (nothing to say if that's the idle unit). */
361     svc = rt_unit(curr_on_cpu(cpu));
362     if ( svc && !is_idle_unit(svc->unit) )
363     {
364         rt_dump_unit(ops, svc);
365     }
366     spin_unlock_irqrestore(&prv->lock, flags);
367 }
368 
369 static void
rt_dump(const struct scheduler *ops)
371 {
372     struct list_head *runq, *depletedq, *replq, *iter;
373     struct rt_private *prv = rt_priv(ops);
374     const struct rt_unit *svc;
375     const struct rt_dom *sdom;
376     unsigned long flags;
377 
378     spin_lock_irqsave(&prv->lock, flags);
379 
380     if ( list_empty(&prv->sdom) )
381         goto out;
382 
383     runq = rt_runq(ops);
384     depletedq = rt_depletedq(ops);
385     replq = rt_replq(ops);
386 
387     printk("Global RunQueue info:\n");
388     list_for_each ( iter, runq )
389     {
390         svc = q_elem(iter);
391         rt_dump_unit(ops, svc);
392     }
393 
394     printk("Global DepletedQueue info:\n");
395     list_for_each ( iter, depletedq )
396     {
397         svc = q_elem(iter);
398         rt_dump_unit(ops, svc);
399     }
400 
401     printk("Global Replenishment Events info:\n");
402     list_for_each ( iter, replq )
403     {
404         svc = replq_elem(iter);
405         rt_dump_unit(ops, svc);
406     }
407 
408     printk("Domain info:\n");
409     list_for_each ( iter, &prv->sdom )
410     {
411         const struct sched_unit *unit;
412 
413         sdom = list_entry(iter, struct rt_dom, sdom_elem);
414         printk("\tdomain: %d\n", sdom->dom->domain_id);
415 
416         for_each_sched_unit ( sdom->dom, unit )
417         {
418             svc = rt_unit(unit);
419             rt_dump_unit(ops, svc);
420         }
421     }
422 
423  out:
424     spin_unlock_irqrestore(&prv->lock, flags);
425 }
426 
/*
 * Update the deadline and budget when now >= cur_deadline.
 * The deadline needs to be advanced to that of the current period.
 */
431 static void
rt_update_deadline(s_time_t now, struct rt_unit *svc)
433 {
434     ASSERT(now >= svc->cur_deadline);
435     ASSERT(svc->period != 0);
436 
437     if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
438     {
439         do
440             svc->cur_deadline += svc->period;
441         while ( svc->cur_deadline <= now );
442     }
443     else
444     {
445         long count = ((now - svc->cur_deadline) / svc->period) + 1;
446         svc->cur_deadline += count * svc->period;
447     }
448 
449     /*
450      * svc may be scheduled to run immediately after it misses deadline
451      * Then rt_update_deadline is called before rt_schedule, which
452      * should only deduct the time spent in current period from the budget
453      */
454     svc->last_start = now;
455     svc->cur_budget = svc->budget;
456     svc->priority_level = 0;
457 
458     /* TRACE */
459     {
460         struct __packed {
461             unsigned unit:16, dom:16;
462             unsigned priority_level;
463             uint64_t cur_deadline, cur_budget;
464         } d;
465         d.dom = svc->unit->domain->domain_id;
466         d.unit = svc->unit->unit_id;
467         d.priority_level = svc->priority_level;
468         d.cur_deadline = (uint64_t) svc->cur_deadline;
469         d.cur_budget = (uint64_t) svc->cur_budget;
470         trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
471                   sizeof(d),
472                   (unsigned char *) &d);
473     }
474 
475     return;
476 }
477 
/*
 * Helpers for removing and inserting a unit in a queue
 * that is being kept ordered by the units' deadlines (as EDF
 * mandates).
 *
 * For callers' convenience, the unit removing helper returns
 * true if the unit removed was the one at the front of the
 * queue; similarly, the inserting helper returns true if the
 * inserted unit ended up at the front of the queue (i.e., in both
 * cases, if the unit with the earliest deadline is what we
 * are dealing with).
 */
static inline bool
deadline_queue_remove(struct list_head *queue, struct list_head *elem)
{
    bool first = queue->next == elem;

    list_del_init(elem);
    return first;
}
501 
502 static inline bool
deadline_queue_insert(struct rt_unit *(*qelem)(struct list_head *),
                      struct rt_unit *svc, struct list_head *elem,
                      struct list_head *queue)
506 {
507     struct list_head *iter;
508     bool first = true;
509 
510     list_for_each ( iter, queue )
511     {
512         const struct rt_unit * iter_svc = (*qelem)(iter);
513         if ( compare_unit_priority(svc, iter_svc) > 0 )
514             break;
515         first = false;
516     }
517     list_add_tail(elem, iter);
518     return first;
519 }
520 #define deadline_runq_insert(...) \
521   deadline_queue_insert(&q_elem, ##__VA_ARGS__)
522 #define deadline_replq_insert(...) \
523   deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
524 
525 static inline void
q_remove(struct rt_unit *svc)
527 {
528     ASSERT( unit_on_q(svc) );
529     list_del_init(&svc->q_elem);
530 }
531 
532 static inline void
replq_remove(const struct scheduler *ops, struct rt_unit *svc)
534 {
535     struct rt_private *prv = rt_priv(ops);
536     struct list_head *replq = rt_replq(ops);
537 
538     ASSERT( unit_on_replq(svc) );
539 
540     if ( deadline_queue_remove(replq, &svc->replq_elem) )
541     {
542         /*
543          * The replenishment timer needs to be set to fire when a
544          * replenishment for the unit at the front of the replenishment
545          * queue is due. If it is such unit that we just removed, we may
546          * need to reprogram the timer.
547          */
548         if ( !list_empty(replq) )
549         {
550             const struct rt_unit *svc_next = replq_elem(replq->next);
551             set_timer(&prv->repl_timer, svc_next->cur_deadline);
552         }
553         else
554             stop_timer(&prv->repl_timer);
555     }
556 }
557 
/*
 * Insert svc into the RunQ if it has budget, according to EDF:
 * units with smaller deadlines go first.
 * Insert svc into the DepletedQ, unsorted, if it has no budget.
 */
563 static void
runq_insert(const struct scheduler *ops, struct rt_unit *svc)
565 {
566     struct rt_private *prv = rt_priv(ops);
567     struct list_head *runq = rt_runq(ops);
568 
569     ASSERT( spin_is_locked(&prv->lock) );
570     ASSERT( !unit_on_q(svc) );
571     ASSERT( unit_on_replq(svc) );
572 
573     /* add svc to runq if svc still has budget or its extratime is set */
574     if ( svc->cur_budget > 0 ||
575          has_extratime(svc) )
576         deadline_runq_insert(svc, &svc->q_elem, runq);
577     else
578         list_add(&svc->q_elem, &prv->depletedq);
579 }
580 
581 static void
replq_insert(const struct scheduler *ops, struct rt_unit *svc)
583 {
584     struct list_head *replq = rt_replq(ops);
585     struct rt_private *prv = rt_priv(ops);
586 
587     ASSERT( !unit_on_replq(svc) );
588 
589     /*
590      * The timer may be re-programmed if svc is inserted
591      * at the front of the event list.
592      */
593     if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
594         set_timer(&prv->repl_timer, svc->cur_deadline);
595 }
596 
597 /*
598  * Removes and re-inserts an event to the replenishment queue.
599  * The aim is to update its position inside the queue, as its
600  * deadline (and hence its replenishment time) could have
601  * changed.
602  */
603 static void
replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
605 {
606     struct list_head *replq = rt_replq(ops);
607     const struct rt_unit *rearm_svc = svc;
608     bool rearm = false;
609 
610     ASSERT( unit_on_replq(svc) );
611 
612     /*
613      * If svc was at the front of the replenishment queue, we certainly
614      * need to re-program the timer, and we want to use the deadline of
615      * the unit which is now at the front of the queue (which may still
616      * be svc or not).
617      *
618      * We may also need to re-program, if svc has been put at the front
619      * of the replenishment queue when being re-inserted.
620      */
621     if ( deadline_queue_remove(replq, &svc->replq_elem) )
622     {
623         deadline_replq_insert(svc, &svc->replq_elem, replq);
624         rearm_svc = replq_elem(replq->next);
625         rearm = true;
626     }
627     else
628         rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
629 
630     if ( rearm )
631         set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
632 }
633 
/*
 * Pick a valid resource for the unit.
 * A valid resource for a unit is the intersection of the unit's affinity
 * and the available resources.
 */
639 static struct sched_resource *
rt_res_pick_locked(const struct sched_unit *unit, unsigned int locked_cpu)
641 {
642     cpumask_t *cpus = cpumask_scratch_cpu(locked_cpu);
643     const cpumask_t *online;
644     int cpu;
645 
646     online = cpupool_domain_master_cpumask(unit->domain);
647     cpumask_and(cpus, online, unit->cpu_hard_affinity);
648 
649     cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
650             ? sched_unit_master(unit)
651             : cpumask_cycle(sched_unit_master(unit), cpus);
652     ASSERT( !cpumask_empty(cpus) && cpumask_test_cpu(cpu, cpus) );
653 
654     return get_sched_res(cpu);
655 }
656 
/*
 * Pick a valid resource for the unit.
 * A valid resource for a unit is the intersection of the unit's affinity
 * and the available resources.
 */
662 static struct sched_resource *
rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
664 {
665     struct sched_resource *res;
666 
667     res = rt_res_pick_locked(unit, unit->res->master_cpu);
668 
669     return res;
670 }
671 
672 /*
673  * Init/Free related code
674  */
675 static int
rt_init(struct scheduler *ops)
677 {
678     int rc = -ENOMEM;
679     struct rt_private *prv = xzalloc(struct rt_private);
680 
681     printk("Initializing RTDS scheduler\n"
682            "WARNING: This is experimental software in development.\n"
683            "Use at your own risk.\n");
684 
685     if ( prv == NULL )
686         goto err;
687 
688     spin_lock_init(&prv->lock);
689     INIT_LIST_HEAD(&prv->sdom);
690     INIT_LIST_HEAD(&prv->runq);
691     INIT_LIST_HEAD(&prv->depletedq);
692     INIT_LIST_HEAD(&prv->replq);
693 
694     ops->sched_data = prv;
695     rc = 0;
696 
697  err:
698     if ( rc )
699         xfree(prv);
700 
701     return rc;
702 }
703 
704 static void
rt_deinit(struct scheduler *ops)
706 {
707     struct rt_private *prv = rt_priv(ops);
708 
709     ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
710            prv->repl_timer.status == TIMER_STATUS_killed);
711 
712     ops->sched_data = NULL;
713     xfree(prv);
714 }
715 
716 /* Change the scheduler of cpu to us (RTDS). */
717 static spinlock_t *
rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
                void *pdata, void *vdata)
720 {
721     struct rt_private *prv = rt_priv(new_ops);
722     struct rt_unit *svc = vdata;
723 
724     ASSERT(!pdata && svc && is_idle_unit(svc->unit));
725 
726     /*
727      * We are holding the runqueue lock already (it's been taken in
728      * schedule_cpu_switch()). It's actually the runqueue lock of
729      * another scheduler, but that is how things need to be, for
730      * preventing races.
731      */
732     ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);
733 
734     /*
735      * If we are the absolute first cpu being switched toward this
736      * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
737      * first one that is added back to the cpupool that had all its cpus
738      * removed (in which case we'll see TIMER_STATUS_killed), it's our
739      * job to (re)initialize the timer.
740      */
741     if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
742          prv->repl_timer.status == TIMER_STATUS_killed )
743     {
744         init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
745         dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
746     }
747 
748     sched_idle_unit(cpu)->priv = vdata;
749 
750     return &prv->lock;
751 }
752 
753 static void
rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
755 {
756     unsigned long flags;
757     struct rt_private *prv = rt_priv(ops);
758 
759     spin_lock_irqsave(&prv->lock, flags);
760 
761     if ( prv->repl_timer.cpu == cpu )
762     {
763         cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid;
764         unsigned int new_cpu = cpumask_cycle(cpu, online);
765 
        /*
         * Make sure the timer runs on one of the cpus that are still
         * available to this scheduler. If there aren't any left, it means
         * it's time to just kill it.
         */
771         if ( new_cpu >= nr_cpu_ids )
772         {
773             kill_timer(&prv->repl_timer);
774             dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
775         }
776         else
777         {
778             migrate_timer(&prv->repl_timer, new_cpu);
779         }
780     }
781 
782     spin_unlock_irqrestore(&prv->lock, flags);
783 }
784 
785 static void *
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
787 {
788     unsigned long flags;
789     struct rt_dom *sdom;
790     struct rt_private * prv = rt_priv(ops);
791 
792     sdom = xzalloc(struct rt_dom);
793     if ( sdom == NULL )
794         return ERR_PTR(-ENOMEM);
795 
796     INIT_LIST_HEAD(&sdom->sdom_elem);
797     sdom->dom = dom;
798 
799     /* spinlock here to insert the dom */
800     spin_lock_irqsave(&prv->lock, flags);
801     list_add_tail(&sdom->sdom_elem, &(prv->sdom));
802     spin_unlock_irqrestore(&prv->lock, flags);
803 
804     return sdom;
805 }
806 
807 static void
rt_free_domdata(const struct scheduler *ops, void *data)
809 {
810     struct rt_dom *sdom = data;
811     struct rt_private *prv = rt_priv(ops);
812 
813     if ( sdom )
814     {
815         unsigned long flags;
816 
817         spin_lock_irqsave(&prv->lock, flags);
818         list_del_init(&sdom->sdom_elem);
819         spin_unlock_irqrestore(&prv->lock, flags);
820 
821         xfree(sdom);
822     }
823 }
824 
825 static void *
rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
827 {
828     struct rt_unit *svc;
829 
830     /* Allocate per-UNIT info */
831     svc = xzalloc(struct rt_unit);
832     if ( svc == NULL )
833         return NULL;
834 
835     INIT_LIST_HEAD(&svc->q_elem);
836     INIT_LIST_HEAD(&svc->replq_elem);
837     svc->flags = 0U;
838     svc->sdom = dd;
839     svc->unit = unit;
840     svc->last_start = 0;
841 
842     __set_bit(__RTDS_extratime, &svc->flags);
843     svc->priority_level = 0;
844     svc->period = RTDS_DEFAULT_PERIOD;
845     if ( !is_idle_unit(unit) )
846         svc->budget = RTDS_DEFAULT_BUDGET;
847 
848     SCHED_STAT_CRANK(unit_alloc);
849 
850     return svc;
851 }
852 
853 static void
rt_free_udata(const struct scheduler *ops, void *priv)
855 {
856     struct rt_unit *svc = priv;
857 
858     xfree(svc);
859 }
860 
/*
 * It is called in sched_move_domain() and sched_init_vcpu()
 * in schedule.c.
 * When moving a domain to a new cpupool, it inserts the units of the
 * moving domain into the RunQ of the scheduler of the destination
 * cpupool.
 */
868 static void
rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
870 {
871     struct rt_unit *svc = rt_unit(unit);
872     s_time_t now;
873     spinlock_t *lock;
874     unsigned int cpu = smp_processor_id();
875 
876     BUG_ON( is_idle_unit(unit) );
877 
878     /* This is safe because unit isn't yet being scheduled */
879     lock = pcpu_schedule_lock_irq(cpu);
880     sched_set_res(unit, rt_res_pick_locked(unit, cpu));
881     pcpu_schedule_unlock_irq(lock, cpu);
882 
883     lock = unit_schedule_lock_irq(unit);
884 
885     now = NOW();
886     if ( now >= svc->cur_deadline )
887         rt_update_deadline(now, svc);
888 
889     if ( !unit_on_q(svc) && unit_runnable(unit) )
890     {
891         replq_insert(ops, svc);
892 
893         if ( !unit->is_running )
894             runq_insert(ops, svc);
895     }
896     unit_schedule_unlock_irq(lock, unit);
897 
898     SCHED_STAT_CRANK(unit_insert);
899 }
900 
901 /*
902  * Remove rt_unit svc from the old scheduler in source cpupool.
903  */
904 static void
rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
906 {
907     struct rt_unit * const svc = rt_unit(unit);
908     struct rt_dom * const sdom = svc->sdom;
909     spinlock_t *lock;
910 
911     SCHED_STAT_CRANK(unit_remove);
912 
913     BUG_ON( sdom == NULL );
914 
915     lock = unit_schedule_lock_irq(unit);
916     if ( unit_on_q(svc) )
917         q_remove(svc);
918 
919     if ( unit_on_replq(svc) )
920         replq_remove(ops,svc);
921 
922     unit_schedule_unlock_irq(lock, unit);
923 }
924 
925 /*
926  * Burn budget in nanosecond granularity
927  */
928 static void
burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
930 {
931     s_time_t delta;
932 
933     /* don't burn budget for idle UNIT */
934     if ( is_idle_unit(svc->unit) )
935         return;
936 
937     /* burn at nanoseconds level */
938     delta = now - svc->last_start;
939     /*
940      * delta < 0 only happens in nested virtualization;
941      * TODO: how should we handle delta < 0 in a better way?
942      */
943     if ( delta < 0 )
944     {
945         printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
946                 __func__, delta);
947         svc->last_start = now;
948         return;
949     }
950 
951     svc->cur_budget -= delta;
952     svc->last_start = now;
953 
954     if ( svc->cur_budget <= 0 )
955     {
956         if ( has_extratime(svc) )
957         {
958             svc->priority_level++;
959             svc->cur_budget = svc->budget;
960         }
961         else
962         {
963             svc->cur_budget = 0;
964             __set_bit(__RTDS_depleted, &svc->flags);
965         }
966     }
967 
968     /* TRACE */
969     {
970         struct __packed {
971             unsigned unit:16, dom:16;
972             uint64_t cur_budget;
973             int delta;
974             unsigned priority_level;
975             bool has_extratime;
976         } d;
977         d.dom = svc->unit->domain->domain_id;
978         d.unit = svc->unit->unit_id;
979         d.cur_budget = (uint64_t) svc->cur_budget;
980         d.delta = delta;
981         d.priority_level = svc->priority_level;
982         d.has_extratime = svc->flags & RTDS_extratime;
983         trace_var(TRC_RTDS_BUDGET_BURN, 1,
984                   sizeof(d),
985                   (unsigned char *) &d);
986     }
987 }
988 
/*
 * The RunQ is sorted. Pick the first unit that can run in the given cpumask.
 * If there is none, return NULL.
 * The lock is grabbed before calling this function.
 */
993 static struct rt_unit *
runq_pick(const struct scheduler *ops, const cpumask_t *mask, unsigned int cpu)
995 {
996     struct list_head *runq = rt_runq(ops);
997     struct list_head *iter;
998     struct rt_unit *svc = NULL;
999     struct rt_unit *iter_svc = NULL;
1000     cpumask_t *cpu_common = cpumask_scratch_cpu(cpu);
1001     const cpumask_t *online;
1002 
1003     list_for_each ( iter, runq )
1004     {
1005         iter_svc = q_elem(iter);
1006 
1007         /* mask cpu_hard_affinity & cpupool & mask */
1008         online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
1009         cpumask_and(cpu_common, online, iter_svc->unit->cpu_hard_affinity);
1010         cpumask_and(cpu_common, mask, cpu_common);
1011         if ( cpumask_empty(cpu_common) )
1012             continue;
1013 
1014         ASSERT( iter_svc->cur_budget > 0 );
1015 
1016         svc = iter_svc;
1017         break;
1018     }
1019 
1020     /* TRACE */
1021     {
1022         if( svc != NULL )
1023         {
1024             struct __packed {
1025                 unsigned unit:16, dom:16;
1026                 uint64_t cur_deadline, cur_budget;
1027             } d;
1028             d.dom = svc->unit->domain->domain_id;
1029             d.unit = svc->unit->unit_id;
1030             d.cur_deadline = (uint64_t) svc->cur_deadline;
1031             d.cur_budget = (uint64_t) svc->cur_budget;
1032             trace_var(TRC_RTDS_RUNQ_PICK, 1,
1033                       sizeof(d),
1034                       (unsigned char *) &d);
1035         }
1036     }
1037 
1038     return svc;
1039 }
1040 
1041 /*
1042  * schedule function for rt scheduler.
1043  * The lock is already grabbed in schedule.c, no need to lock here
1044  */
1045 static void
rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
            s_time_t now, bool tasklet_work_scheduled)
1048 {
1049     const unsigned int cur_cpu = smp_processor_id();
1050     const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
1051     struct rt_private *prv = rt_priv(ops);
1052     struct rt_unit *const scurr = rt_unit(currunit);
1053     struct rt_unit *snext = NULL;
1054     bool migrated = false;
1055 
1056     /* TRACE */
1057     {
1058         struct __packed {
1059             unsigned cpu:16, tasklet:8, tickled:4, idle:4;
1060         } d;
1061         d.cpu = cur_cpu;
1062         d.tasklet = tasklet_work_scheduled;
1063         d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled);
1064         d.idle = is_idle_unit(currunit);
1065         trace_var(TRC_RTDS_SCHEDULE, 1,
1066                   sizeof(d),
1067                   (unsigned char *)&d);
1068     }
1069 
    /* clear the tickled bit now that we've been scheduled */
1071     cpumask_clear_cpu(sched_cpu, &prv->tickled);
1072 
    /* burn_budget() returns immediately for the idle UNIT */
1074     burn_budget(ops, scurr, now);
1075 
1076     if ( tasklet_work_scheduled )
1077     {
1078         trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0,  NULL);
1079         snext = rt_unit(sched_idle_unit(sched_cpu));
1080     }
1081     else
1082     {
1083         snext = runq_pick(ops, cpumask_of(sched_cpu), cur_cpu);
1084 
1085         if ( snext == NULL )
1086             snext = rt_unit(sched_idle_unit(sched_cpu));
1087         else if ( !unit_runnable_state(snext->unit) )
1088         {
1089             q_remove(snext);
1090             snext = rt_unit(sched_idle_unit(sched_cpu));
1091         }
1092 
1093         /* if scurr has higher priority and budget, still pick scurr */
1094         if ( !is_idle_unit(currunit) &&
1095              unit_runnable_state(currunit) &&
1096              scurr->cur_budget > 0 &&
1097              ( is_idle_unit(snext->unit) ||
1098                compare_unit_priority(scurr, snext) > 0 ) )
1099             snext = scurr;
1100     }
1101 
1102     if ( snext != scurr &&
1103          !is_idle_unit(currunit) &&
1104          unit_runnable(currunit) )
1105         __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1106 
1107     snext->last_start = now;
1108     currunit->next_time =  -1; /* if an idle unit is picked */
1109     if ( !is_idle_unit(snext->unit) )
1110     {
1111         if ( snext != scurr )
1112         {
1113             q_remove(snext);
1114             __set_bit(__RTDS_scheduled, &snext->flags);
1115         }
1116         if ( sched_unit_master(snext->unit) != sched_cpu )
1117         {
1118             sched_set_res(snext->unit, get_sched_res(sched_cpu));
1119             migrated = true;
1120         }
1121         /* Invoke the scheduler next time. */
1122         currunit->next_time = snext->cur_budget;
1123     }
1124     currunit->next_task = snext->unit;
1125     snext->unit->migrated = migrated;
1126 }
1127 
1128 /*
1129  * Remove UNIT from RunQ
1130  * The lock is already grabbed in schedule.c, no need to lock here
1131  */
1132 static void
rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
1134 {
1135     struct rt_unit * const svc = rt_unit(unit);
1136 
1137     BUG_ON( is_idle_unit(unit) );
1138     SCHED_STAT_CRANK(unit_sleep);
1139 
1140     if ( curr_on_cpu(sched_unit_master(unit)) == unit )
1141         cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
1142     else if ( unit_on_q(svc) )
1143     {
1144         q_remove(svc);
1145         replq_remove(ops, svc);
1146     }
1147     else if ( svc->flags & RTDS_delayed_runq_add )
1148         __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1149 }
1150 
/*
 * Pick a cpu where to run a unit,
 * possibly kicking out the unit currently running there.
 * Called by wake() and context_saved().
 * We have a runnable candidate here, and the kick logic is:
 * Among all the cpus that are within the cpu affinity:
 * 1) if there are any idle CPUs, kick one.
 *    For cache benefit, we check new->cpu first;
 * 2) now all pcpus are busy;
 *    among all the running units, pick the lowest-priority one;
 *    if snext has higher priority, kick it out.
 *
 * TODO:
 * 1) what if these two units belong to the same domain?
 *    replacing a unit belonging to the same domain introduces more overhead
 *
 * The lock is grabbed before calling this function.
 */
1169 static void
runq_tickle(const struct scheduler *ops, const struct rt_unit *new)
1171 {
1172     struct rt_private *prv = rt_priv(ops);
1173     const struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
1174     const struct rt_unit *iter_svc;
1175     const struct sched_unit *iter_unit;
1176     int cpu = 0, cpu_to_tickle = 0;
1177     cpumask_t *not_tickled = cpumask_scratch_cpu(smp_processor_id());
1178     const cpumask_t *online;
1179 
1180     if ( new == NULL || is_idle_unit(new->unit) )
1181         return;
1182 
1183     online = cpupool_domain_master_cpumask(new->unit->domain);
1184     cpumask_and(not_tickled, online, new->unit->cpu_hard_affinity);
1185     cpumask_andnot(not_tickled, not_tickled, &prv->tickled);
1186 
    /*
     * 1) If there are any idle CPUs, kick one.
     *    For cache benefit, we first check new->cpu.
     *    The same loop also finds the one with the lowest priority.
     */
1192     cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), not_tickled);
    while ( cpu != nr_cpu_ids )
1194     {
1195         iter_unit = curr_on_cpu(cpu);
1196         if ( is_idle_unit(iter_unit) )
1197         {
1198             SCHED_STAT_CRANK(tickled_idle_cpu);
1199             cpu_to_tickle = cpu;
1200             goto out;
1201         }
1202         iter_svc = rt_unit(iter_unit);
1203         if ( latest_deadline_unit == NULL ||
1204              compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
1205             latest_deadline_unit = iter_svc;
1206 
1207         cpumask_clear_cpu(cpu, not_tickled);
1208         cpu = cpumask_cycle(cpu, not_tickled);
1209     }
1210 
    /* 2) candidate has higher priority, kick out the lowest-priority unit */
1212     if ( latest_deadline_unit != NULL &&
1213          compare_unit_priority(latest_deadline_unit, new) < 0 )
1214     {
1215         SCHED_STAT_CRANK(tickled_busy_cpu);
1216         cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
1217         goto out;
1218     }
1219 
1220     /* didn't tickle any cpu */
1221     SCHED_STAT_CRANK(tickled_no_cpu);
1222     return;
1223  out:
1224     /* TRACE */
1225     {
1226         struct {
1227             unsigned cpu:16, pad:16;
1228         } d;
1229         d.cpu = cpu_to_tickle;
1230         d.pad = 0;
1231         trace_var(TRC_RTDS_TICKLE, 1,
1232                   sizeof(d),
1233                   (unsigned char *)&d);
1234     }
1235 
1236     cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1237     cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1238     return;
1239 }
1240 
/*
 * Should always wake up a runnable unit and put it back on the RunQ.
 * Check priority to decide whether to raise an interrupt.
 * The lock is already grabbed in schedule.c, no need to lock here.
 * TODO: what if these two units belong to the same domain?
 */
1247 static void
rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
1249 {
1250     struct rt_unit * const svc = rt_unit(unit);
1251     s_time_t now;
1252     bool missed;
1253 
1254     BUG_ON( is_idle_unit(unit) );
1255 
1256     if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
1257     {
1258         SCHED_STAT_CRANK(unit_wake_running);
1259         return;
1260     }
1261 
    /* If on the RunQ/DepletedQ, just updating the info is ok */
1263     if ( unlikely(unit_on_q(svc)) )
1264     {
1265         SCHED_STAT_CRANK(unit_wake_onrunq);
1266         return;
1267     }
1268 
1269     if ( likely(unit_runnable(unit)) )
1270         SCHED_STAT_CRANK(unit_wake_runnable);
1271     else
1272         SCHED_STAT_CRANK(unit_wake_not_runnable);
1273 
1274     /*
1275      * If a deadline passed while svc was asleep/blocked, we need new
1276      * scheduling parameters (a new deadline and full budget).
1277      */
1278     now = NOW();
1279 
1280     missed = ( now >= svc->cur_deadline );
1281     if ( missed )
1282         rt_update_deadline(now, svc);
1283 
    /*
     * If the context hasn't been saved for this unit yet, we can't put it on
     * the run-queue/depleted-queue. Instead, we set the appropriate flag;
     * the unit will be put back on the queue after the context has been
     * saved (in rt_context_saved()).
     */
1290     if ( unlikely(svc->flags & RTDS_scheduled) )
1291     {
1292         __set_bit(__RTDS_delayed_runq_add, &svc->flags);
        /*
         * The unit is waking up already, and we didn't even have the time to
         * remove its next replenishment event from the replenishment queue
         * when it blocked! No big deal. If we did not miss the deadline in
         * the meantime, let's just leave it there. If we did, let's remove it
         * and queue a new one (to occur at our new deadline).
         */
1300         if ( missed )
1301            replq_reinsert(ops, svc);
1302         return;
1303     }
1304 
1305     /* Replenishment event got cancelled when we blocked. Add it back. */
1306     replq_insert(ops, svc);
1307     /* insert svc to runq/depletedq because svc is not in queue now */
1308     runq_insert(ops, svc);
1309 
1310     runq_tickle(ops, svc);
1311 }
1312 
/*
 * scurr has finished its context switch; insert it back into the RunQ,
 * and then pick the highest-priority unit from the runq to run.
 */
1317 static void
rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
1319 {
1320     struct rt_unit *svc = rt_unit(unit);
1321     spinlock_t *lock = unit_schedule_lock_irq(unit);
1322 
1323     __clear_bit(__RTDS_scheduled, &svc->flags);
1324     /* not insert idle unit to runq */
1325     if ( is_idle_unit(unit) )
1326         goto out;
1327 
1328     if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1329          likely(unit_runnable(unit)) )
1330     {
1331         runq_insert(ops, svc);
1332         runq_tickle(ops, svc);
1333     }
1334     else
1335         replq_remove(ops, svc);
1336 
1337 out:
1338     unit_schedule_unlock_irq(lock, unit);
1339 }
1340 
/*
 * Set/get the scheduling parameters of each unit of a domain.
 */
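/*
 * Toolstack usage sketch (illustrative, not from the original comments):
 * these operations are normally reached via the xl toolstack. Assuming a
 * domain named "domU", something along the lines of
 *
 *     xl sched-rtds -d domU -p 10000 -b 4000
 *
 * sets a 10ms period and 4ms budget (arguments are in microseconds) and
 * arrives here as XEN_DOMCTL_SCHEDOP_putinfo; the per-vcpu variants of the
 * command map to the {get,put}vcpuinfo cases below.
 */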
1344 static int
rt_dom_cntl(
    const struct scheduler *ops,
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
1349 {
1350     struct rt_private *prv = rt_priv(ops);
1351     struct rt_unit *svc;
1352     const struct sched_unit *unit;
1353     unsigned long flags;
1354     int rc = 0;
1355     struct xen_domctl_schedparam_vcpu local_sched;
1356     s_time_t period, budget;
1357     uint32_t index = 0;
1358 
1359     switch ( op->cmd )
1360     {
1361     case XEN_DOMCTL_SCHEDOP_getinfo:
1362         /* Return the default parameters. */
1363         op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1364         op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1365         break;
1366     case XEN_DOMCTL_SCHEDOP_putinfo:
1367         if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1368         {
1369             rc = -EINVAL;
1370             break;
1371         }
1372         spin_lock_irqsave(&prv->lock, flags);
1373         for_each_sched_unit ( d, unit )
1374         {
1375             svc = rt_unit(unit);
            svc->period = MICROSECS(op->u.rtds.period); /* convert to nanoseconds */
1377             svc->budget = MICROSECS(op->u.rtds.budget);
1378         }
1379         spin_unlock_irqrestore(&prv->lock, flags);
1380         break;
1381     case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1382     case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1383         while ( index < op->u.v.nr_vcpus )
1384         {
1385             if ( copy_from_guest_offset(&local_sched,
1386                                         op->u.v.vcpus, index, 1) )
1387             {
1388                 rc = -EFAULT;
1389                 break;
1390             }
1391             if ( local_sched.vcpuid >= d->max_vcpus ||
1392                  d->vcpu[local_sched.vcpuid] == NULL )
1393             {
1394                 rc = -EINVAL;
1395                 break;
1396             }
1397 
1398             if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1399             {
1400                 spin_lock_irqsave(&prv->lock, flags);
1401                 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1402                 local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1403                 local_sched.u.rtds.period = svc->period / MICROSECS(1);
1404                 if ( has_extratime(svc) )
1405                     local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1406                 else
1407                     local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1408                 spin_unlock_irqrestore(&prv->lock, flags);
1409 
1410                 if ( copy_to_guest_offset(op->u.v.vcpus, index,
1411                                           &local_sched, 1) )
1412                 {
1413                     rc = -EFAULT;
1414                     break;
1415                 }
1416             }
1417             else
1418             {
1419                 period = MICROSECS(local_sched.u.rtds.period);
1420                 budget = MICROSECS(local_sched.u.rtds.budget);
1421                 if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1422                      budget > period || period < RTDS_MIN_PERIOD )
1423                 {
1424                     rc = -EINVAL;
1425                     break;
1426                 }
1427 
1428                 spin_lock_irqsave(&prv->lock, flags);
1429                 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1430                 svc->period = period;
1431                 svc->budget = budget;
1432                 if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1433                     __set_bit(__RTDS_extratime, &svc->flags);
1434                 else
1435                     __clear_bit(__RTDS_extratime, &svc->flags);
1436                 spin_unlock_irqrestore(&prv->lock, flags);
1437             }
            /* Process at most 64 vCPUs without checking for preemption. */
1439             if ( (++index > 63) && hypercall_preempt_check() )
1440                 break;
1441         }
1442         if ( !rc )
            /* notify the caller how many vCPUs have been processed. */
1444             op->u.v.nr_vcpus = index;
1445         break;
1446     }
1447 
1448     return rc;
1449 }
1450 
1451 /*
1452  * The replenishment timer handler picks units
1453  * from the replq and does the actual replenishment.
1454  */
static void repl_timer_handler(void *data)
{
1456     s_time_t now;
1457     const struct scheduler *ops = data;
1458     struct rt_private *prv = rt_priv(ops);
1459     struct list_head *replq = rt_replq(ops);
1460     struct list_head *runq = rt_runq(ops);
1461     struct list_head *iter, *tmp;
1462     struct rt_unit *svc;
1463     LIST_HEAD(tmp_replq);
1464 
1465     spin_lock_irq(&prv->lock);
1466 
1467     now = NOW();
1468 
1469     /*
1470      * Do the replenishment and move replenished units
1471      * to the temporary list to tickle.
1472      * If svc is on run queue, we need to put it at
1473      * the correct place since its deadline changes.
1474      */
1475     list_for_each_safe ( iter, tmp, replq )
1476     {
1477         svc = replq_elem(iter);
1478 
1479         if ( now < svc->cur_deadline )
1480             break;
1481 
1482         list_del(&svc->replq_elem);
1483         rt_update_deadline(now, svc);
1484         list_add(&svc->replq_elem, &tmp_replq);
1485 
1486         if ( unit_on_q(svc) )
1487         {
1488             q_remove(svc);
1489             runq_insert(ops, svc);
1490         }
1491     }
1492 
    /*
     * Iterate through the list of updated units.
     * If an updated unit is running, tickle the head of the
     * runqueue if it has a higher priority.
     * If an updated unit was depleted and on the runqueue, tickle it.
     * Finally, reinsert the units back into the replenishment events list.
     */
1500     list_for_each_safe ( iter, tmp, &tmp_replq )
1501     {
1502         svc = replq_elem(iter);
1503 
1504         if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
1505              !list_empty(runq) )
1506         {
1507             struct rt_unit *next_on_runq = q_elem(runq->next);
1508 
1509             if ( compare_unit_priority(svc, next_on_runq) < 0 )
1510                 runq_tickle(ops, next_on_runq);
1511         }
1512         else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1513                   unit_on_q(svc) )
1514             runq_tickle(ops, svc);
1515 
1516         list_del(&svc->replq_elem);
1517         deadline_replq_insert(svc, &svc->replq_elem, replq);
1518     }
1519 
1520     /*
1521      * If there are units left in the replenishment event list,
1522      * set the next replenishment to happen at the deadline of
1523      * the one in the front.
1524      */
1525     if ( !list_empty(replq) )
1526         set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);
1527 
1528     spin_unlock_irq(&prv->lock);
1529 }
1530 
1531 static const struct scheduler sched_rtds_def = {
1532     .name           = "SMP RTDS Scheduler",
1533     .opt_name       = "rtds",
1534     .sched_id       = XEN_SCHEDULER_RTDS,
1535     .sched_data     = NULL,
1536 
1537     .dump_cpu_state = rt_dump_pcpu,
1538     .dump_settings  = rt_dump,
1539     .init           = rt_init,
1540     .deinit         = rt_deinit,
1541     .switch_sched   = rt_switch_sched,
1542     .deinit_pdata   = rt_deinit_pdata,
1543     .alloc_domdata  = rt_alloc_domdata,
1544     .free_domdata   = rt_free_domdata,
1545     .alloc_udata    = rt_alloc_udata,
1546     .free_udata     = rt_free_udata,
1547     .insert_unit    = rt_unit_insert,
1548     .remove_unit    = rt_unit_remove,
1549 
1550     .adjust         = rt_dom_cntl,
1551 
1552     .pick_resource  = rt_res_pick,
1553     .do_schedule    = rt_schedule,
1554     .sleep          = rt_unit_sleep,
1555     .wake           = rt_unit_wake,
1556     .context_saved  = rt_context_saved,
1557 };
1558 
1559 REGISTER_SCHEDULER(sched_rtds_def);
1560