1 /*****************************************************************************
2 * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
3 * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
4 *
5 * by Sisu Xi, 2013, Washington University in Saint Louis
6 * Meng Xu, 2014-2016, University of Pennsylvania
7 *
8 * Conversion toward event driven model by Tianyang Chen
9 * and Dagaen Golomb, 2016, University of Pennsylvania
10 *
11 * based on the code of the Credit scheduler
12 */
13
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/timer.h>
22 #include <xen/perfc.h>
23 #include <xen/softirq.h>
24 #include <asm/atomic.h>
25 #include <xen/errno.h>
26 #include <xen/trace.h>
27 #include <xen/cpu.h>
28 #include <xen/keyhandler.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32
33 #include "private.h"
34
35 /*
36 * TODO:
37 *
38 * Migration compensation and resistance, like credit2, to make better use of caches;
39 * Lock Holder Problem, using yield?
40 * Self switch problem: UNITs of the same domain may preempt each other;
41 */
42
43 /*
44 * Design:
45 *
46 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
47 * theory in the real-time field.
48 * At any scheduling point, the UNIT with earlier deadline has higher priority.
49 * The scheduler always picks highest priority UNIT to run on a feasible PCPU.
50 * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle
51 * or has a lower-priority UNIT running on it).
52 *
53 * Each UNIT has a dedicated period, budget and an extratime flag.
54 * The deadline of a UNIT is at the end of each period;
55 * A UNIT has its budget replenished at the beginning of each period;
56 * While scheduled, a UNIT burns its budget.
57 * The UNIT needs to finish its budget before its deadline in each period;
58 * The UNIT discards its unused budget at the end of each period.
59 * When a UNIT runs out of budget in a period, if its extratime flag is set,
60 * the UNIT increases its priority_level by 1 and refills its budget; otherwise,
61 * it has to wait until the next period.
62 *
63 * Each UNIT is implemented as a deferrable server.
64 * When a UNIT has a task running on it, its budget is continuously burned;
65 * When a UNIT has no task but still has budget left, its budget is preserved.
66 *
67 * Queue scheme:
68 * A global runqueue and a global depletedqueue for each CPU pool.
69 * The runqueue holds all runnable UNITs with budget,
70 * sorted by priority_level and deadline;
71 * The depletedqueue holds all UNITs without budget, unsorted;
72 *
73 * Note: cpumask and cpupool are supported.
74 */
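
/*
 * Worked example (with the default parameters defined below): a UNIT with
 * period 10ms and budget 4ms may consume at most 4ms of CPU time in every
 * 10ms window, and its deadline is the end of the current window. If it
 * exhausts the 4ms early and has the extratime flag set, it moves to the
 * next priority_level and gets a fresh budget; otherwise it sits on the
 * depletedqueue until its next replenishment.
 */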
75
76 /*
77 * Locking:
78 * A global system lock is used to protect the RunQ and DepletedQ.
79 * The global lock is referenced by sched_res->schedule_lock
80 * from all physical cpus.
81 *
82 * The lock is already grabbed when calling the wake/sleep/schedule functions
83 * in schedule.c
84 *
85 * The functions that involve the RunQ and need to grab the lock are:
86 * unit_insert, unit_remove, context_saved, runq_insert
87 */
88
89
90 /*
91 * Default parameters:
92 * The default period and budget are 10 ms and 4 ms, respectively
93 */
94 #define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
95 #define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
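
/* With these defaults, each UNIT reserves 4ms out of every 10ms, i.e. 40% of one CPU. */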
96
97 /*
98 * Max period: max delta of the time type, because the period is added to the
99 * time a unit activates, so this must not overflow.
100 * Min period: 10 us, considering the scheduling overhead (when period is
101 * too low, scheduling is invoked too frequently, causing high overhead).
102 */
103 #define RTDS_MAX_PERIOD (STIME_DELTA_MAX)
104 #define RTDS_MIN_PERIOD (MICROSECS(10))
105
106 /*
107 * Min budget: 10 us, considering the scheduling overhead (when budget is
108 * consumed too fast, scheduling is invoked too frequently, causing
109 * high overhead).
110 */
111 #define RTDS_MIN_BUDGET (MICROSECS(10))
112
113 /*
114 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
115 * the next deadline, performing addition could be faster if the difference
116 * between cur_deadline and now is small. If the difference is bigger than
117 * 1024 * period, use multiplication.
118 */
119 #define UPDATE_LIMIT_SHIFT 10
120
121 /*
122 * Flags
123 */
124 /*
125 * RTDS_scheduled: Is this unit either running on, or context-switching off,
126 * a physical cpu?
127 * + Accessed only with global lock held.
128 * + Set when chosen as next in rt_schedule().
129 * + Cleared after context switch has been saved in rt_context_saved()
130 * + Checked in unit_wake to see if we can add to the Runqueue, or if we should
131 * set RTDS_delayed_runq_add
132 * + Checked to be false in runq_insert.
133 */
134 #define __RTDS_scheduled 1
135 #define RTDS_scheduled (1<<__RTDS_scheduled)
136 /*
137 * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
138 * once it's done being context-switched out?
139 * + Set when scheduling out in rt_schedule() if prev is runnable
140 * + Set in rt_unit_wake() if it finds RTDS_scheduled set
141 * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
142 * and clears the bit.
143 */
144 #define __RTDS_delayed_runq_add 2
145 #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
146
147 /*
148 * RTDS_depleted: Has this unit run out of budget?
149 * This flag is
150 * + set in burn_budget() if a unit has zero budget left;
151 * + cleared and checked in the replenishment handler,
152 * for the units that are being replenished.
153 */
154 #define __RTDS_depleted 3
155 #define RTDS_depleted (1<<__RTDS_depleted)
156
157 /*
158 * RTDS_extratime: Can the unit run in the time that is
159 * not part of any real-time reservation, and would therefore
160 * be otherwise left idle?
161 */
162 #define __RTDS_extratime 4
163 #define RTDS_extratime (1<<__RTDS_extratime)
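
/*
 * The flags above are manipulated with the non-atomic bit helpers on the
 * __RTDS_* bit numbers (e.g. __set_bit(__RTDS_extratime, &svc->flags)) and
 * tested with the RTDS_* masks (e.g. svc->flags & RTDS_extratime).
 */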
164
165 /*
166 * rt tracing events ("only" 512 available!). Check
167 * include/public/trace.h for more details.
168 */
169 #define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
170 #define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
171 #define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
172 #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
173 #define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
174 #define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6)
175
176 static void repl_timer_handler(void *data);
177
178 /*
179 * System-wide private data, including the global RunQueue/DepletedQ.
180 * Global lock is referenced by sched_res->schedule_lock from all
181 * physical cpus. It can be grabbed via unit_schedule_lock_irq()
182 */
183 struct rt_private {
184 spinlock_t lock; /* the global coarse-grained lock */
185 struct list_head sdom; /* list of available domains, used for dump */
186
187 struct list_head runq; /* ordered list of runnable units */
188 struct list_head depletedq; /* unordered list of depleted units */
189
190 struct timer repl_timer; /* replenishment timer */
191 struct list_head replq; /* ordered list of units that need replenishment */
192
193 cpumask_t tickled; /* cpus that have been tickled */
194 };
195
196 /*
197 * Virtual CPU
198 */
199 struct rt_unit {
200 struct list_head q_elem; /* on the runq/depletedq list */
201 struct list_head replq_elem; /* on the replenishment events list */
202
203 /* UNIT parameters, in nanoseconds */
204 s_time_t period;
205 s_time_t budget;
206
207 /* UNIT current information, in nanoseconds */
208 s_time_t cur_budget; /* current budget */
209 s_time_t last_start; /* last start time */
210 s_time_t cur_deadline; /* current deadline for EDF */
211
212 /* Up-pointers */
213 struct rt_dom *sdom;
214 struct sched_unit *unit;
215
216 unsigned priority_level;
217
218 unsigned flags; /* mark __RTDS_scheduled, etc.. */
219 };
220
221 /*
222 * Domain
223 */
224 struct rt_dom {
225 struct list_head sdom_elem; /* link list on rt_priv */
226 struct domain *dom; /* pointer to upper domain */
227 };
228
229 /*
230 * Useful inline functions
231 */
232 static inline struct rt_private *rt_priv(const struct scheduler *ops)
233 {
234 return ops->sched_data;
235 }
236
237 static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
238 {
239 return unit->priv;
240 }
241
242 static inline struct list_head *rt_runq(const struct scheduler *ops)
243 {
244 return &rt_priv(ops)->runq;
245 }
246
247 static inline struct list_head *rt_depletedq(const struct scheduler *ops)
248 {
249 return &rt_priv(ops)->depletedq;
250 }
251
252 static inline struct list_head *rt_replq(const struct scheduler *ops)
253 {
254 return &rt_priv(ops)->replq;
255 }
256
257 static inline bool has_extratime(const struct rt_unit *svc)
258 {
259 return svc->flags & RTDS_extratime;
260 }
261
262 /*
263 * Helper functions for manipulating the runqueue, the depleted queue,
264 * and the replenishment events queue.
265 */
266 static int
267 unit_on_q(const struct rt_unit *svc)
268 {
269 return !list_empty(&svc->q_elem);
270 }
271
272 static struct rt_unit *
273 q_elem(struct list_head *elem)
274 {
275 return list_entry(elem, struct rt_unit, q_elem);
276 }
277
278 static struct rt_unit *
279 replq_elem(struct list_head *elem)
280 {
281 return list_entry(elem, struct rt_unit, replq_elem);
282 }
283
284 static int
285 unit_on_replq(const struct rt_unit *svc)
286 {
287 return !list_empty(&svc->replq_elem);
288 }
289
290 /*
291 * If v1's priority is higher than or equal to v2's, the return value is >= 0;
292 * otherwise, the return value is < 0.
293 */
294 static s_time_t
295 compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
296 {
297 int prio = v2->priority_level - v1->priority_level;
298
299 if ( prio == 0 )
300 return v2->cur_deadline - v1->cur_deadline;
301
302 return prio;
303 }
304
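/*
 * compare_unit_priority() is what keeps the runqueue and the replenishment
 * queue ordered (via deadline_queue_insert()) and what decides whether a
 * newly woken unit should preempt a running one (via runq_tickle()).
 */
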
305 /*
306 * Debug related code, dump unit/cpu information
307 */
308 static void
309 rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
310 {
311 cpumask_t *cpupool_mask, *mask;
312
313 ASSERT(svc != NULL);
314 /* idle unit */
315 if( svc->sdom == NULL )
316 {
317 printk("\n");
318 return;
319 }
320
321 /*
322 * We can't just use 'cpumask_scratch' because the dumping can
323 * happen from a pCPU outside of this scheduler's cpupool, and
324 * hence it's not right to use its pCPU's scratch mask.
325 * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
326 * own scratch space, since we hold the runqueue lock.
327 */
328 mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));
329
330 cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
331 cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
332 printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
333 " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
334 " \t\t priority_level=%d has_extratime=%d\n"
335 " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
336 svc->unit->domain->domain_id,
337 svc->unit->unit_id,
338 sched_unit_master(svc->unit),
339 svc->period,
340 svc->budget,
341 svc->cur_budget,
342 svc->cur_deadline,
343 svc->last_start,
344 svc->priority_level,
345 has_extratime(svc),
346 unit_on_q(svc),
347 unit_runnable(svc->unit),
348 svc->flags, CPUMASK_PR(mask));
349 }
350
351 static void
352 rt_dump_pcpu(const struct scheduler *ops, int cpu)
353 {
354 struct rt_private *prv = rt_priv(ops);
355 const struct rt_unit *svc;
356 unsigned long flags;
357
358 spin_lock_irqsave(&prv->lock, flags);
359 printk("CPU[%02d]\n", cpu);
360 /* current UNIT (nothing to say if that's the idle unit). */
361 svc = rt_unit(curr_on_cpu(cpu));
362 if ( svc && !is_idle_unit(svc->unit) )
363 {
364 rt_dump_unit(ops, svc);
365 }
366 spin_unlock_irqrestore(&prv->lock, flags);
367 }
368
369 static void
370 rt_dump(const struct scheduler *ops)
371 {
372 struct list_head *runq, *depletedq, *replq, *iter;
373 struct rt_private *prv = rt_priv(ops);
374 const struct rt_unit *svc;
375 const struct rt_dom *sdom;
376 unsigned long flags;
377
378 spin_lock_irqsave(&prv->lock, flags);
379
380 if ( list_empty(&prv->sdom) )
381 goto out;
382
383 runq = rt_runq(ops);
384 depletedq = rt_depletedq(ops);
385 replq = rt_replq(ops);
386
387 printk("Global RunQueue info:\n");
388 list_for_each ( iter, runq )
389 {
390 svc = q_elem(iter);
391 rt_dump_unit(ops, svc);
392 }
393
394 printk("Global DepletedQueue info:\n");
395 list_for_each ( iter, depletedq )
396 {
397 svc = q_elem(iter);
398 rt_dump_unit(ops, svc);
399 }
400
401 printk("Global Replenishment Events info:\n");
402 list_for_each ( iter, replq )
403 {
404 svc = replq_elem(iter);
405 rt_dump_unit(ops, svc);
406 }
407
408 printk("Domain info:\n");
409 list_for_each ( iter, &prv->sdom )
410 {
411 const struct sched_unit *unit;
412
413 sdom = list_entry(iter, struct rt_dom, sdom_elem);
414 printk("\tdomain: %d\n", sdom->dom->domain_id);
415
416 for_each_sched_unit ( sdom->dom, unit )
417 {
418 svc = rt_unit(unit);
419 rt_dump_unit(ops, svc);
420 }
421 }
422
423 out:
424 spin_unlock_irqrestore(&prv->lock, flags);
425 }
426
427 /*
428 * Update deadline and budget when now >= cur_deadline:
429 * the deadline needs to be advanced to the end of the current period.
430 */
431 static void
432 rt_update_deadline(s_time_t now, struct rt_unit *svc)
433 {
434 ASSERT(now >= svc->cur_deadline);
435 ASSERT(svc->period != 0);
436
437 if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
438 {
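        /* now is within UPDATE_LIMIT_SHIFT periods: catch up by repeated addition. */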
439 do
440 svc->cur_deadline += svc->period;
441 while ( svc->cur_deadline <= now );
442 }
443 else
444 {
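        /* Far behind: compute the number of missed periods in one go. */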
445 long count = ((now - svc->cur_deadline) / svc->period) + 1;
446 svc->cur_deadline += count * svc->period;
447 }
448
449 /*
450 * svc may be scheduled to run immediately after it misses its deadline.
451 * Then rt_update_deadline is called before rt_schedule, which
452 * should only deduct the time spent in the current period from the budget.
453 */
454 svc->last_start = now;
455 svc->cur_budget = svc->budget;
456 svc->priority_level = 0;
457
458 /* TRACE */
459 {
460 struct __packed {
461 unsigned unit:16, dom:16;
462 unsigned priority_level;
463 uint64_t cur_deadline, cur_budget;
464 } d;
465 d.dom = svc->unit->domain->domain_id;
466 d.unit = svc->unit->unit_id;
467 d.priority_level = svc->priority_level;
468 d.cur_deadline = (uint64_t) svc->cur_deadline;
469 d.cur_budget = (uint64_t) svc->cur_budget;
470 trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
471 sizeof(d),
472 (unsigned char *) &d);
473 }
474
475 return;
476 }
477
478 /*
479 * Helpers for removing and inserting an unit in a queue
480 * that is being kept ordered by the units' deadlines (as EDF
481 * mandates).
482 *
483 * For callers' convenience, the unit removing helper returns
484 * true if the unit removed was the one at the front of the
485 * queue; similarly, the inserting helper returns true if the
486 * inserted unit ended up at the front of the queue (i.e., in both
487 * cases, if the unit with the earliest deadline is what we
488 * are dealing with).
489 */
490 static inline bool
491 deadline_queue_remove(struct list_head *queue, struct list_head *elem)
492 {
493 bool first = true;
494
495 if ( queue->next != elem )
496 first = false;
497
498 list_del_init(elem);
499 return first;
500 }
501
502 static inline bool
503 deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *),
504 struct rt_unit *svc, struct list_head *elem,
505 struct list_head *queue)
506 {
507 struct list_head *iter;
508 bool first = true;
509
510 list_for_each ( iter, queue )
511 {
512 const struct rt_unit * iter_svc = (*qelem)(iter);
513 if ( compare_unit_priority(svc, iter_svc) > 0 )
514 break;
515 first = false;
516 }
517 list_add_tail(elem, iter);
518 return first;
519 }
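
/*
 * Convenience wrappers: insert into the runqueue or the replenishment
 * queue, passing the accessor that maps a list element back to its rt_unit.
 */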
520 #define deadline_runq_insert(...) \
521 deadline_queue_insert(&q_elem, ##__VA_ARGS__)
522 #define deadline_replq_insert(...) \
523 deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
524
525 static inline void
526 q_remove(struct rt_unit *svc)
527 {
528 ASSERT( unit_on_q(svc) );
529 list_del_init(&svc->q_elem);
530 }
531
532 static inline void
533 replq_remove(const struct scheduler *ops, struct rt_unit *svc)
534 {
535 struct rt_private *prv = rt_priv(ops);
536 struct list_head *replq = rt_replq(ops);
537
538 ASSERT( unit_on_replq(svc) );
539
540 if ( deadline_queue_remove(replq, &svc->replq_elem) )
541 {
542 /*
543 * The replenishment timer needs to be set to fire when a
544 * replenishment for the unit at the front of the replenishment
545 * queue is due. If the unit we just removed was that one, we may
546 * need to reprogram the timer.
547 */
548 if ( !list_empty(replq) )
549 {
550 const struct rt_unit *svc_next = replq_elem(replq->next);
551 set_timer(&prv->repl_timer, svc_next->cur_deadline);
552 }
553 else
554 stop_timer(&prv->repl_timer);
555 }
556 }
557
558 /*
559 * Insert svc with budget into the RunQ according to EDF:
560 * units with smaller deadlines go first.
561 * Insert svc without budget into the DepletedQ, unsorted.
562 */
563 static void
564 runq_insert(const struct scheduler *ops, struct rt_unit *svc)
565 {
566 struct rt_private *prv = rt_priv(ops);
567 struct list_head *runq = rt_runq(ops);
568
569 ASSERT( spin_is_locked(&prv->lock) );
570 ASSERT( !unit_on_q(svc) );
571 ASSERT( unit_on_replq(svc) );
572
573 /* add svc to the runq if it still has budget or its extratime flag is set */
574 if ( svc->cur_budget > 0 ||
575 has_extratime(svc) )
576 deadline_runq_insert(svc, &svc->q_elem, runq);
577 else
578 list_add(&svc->q_elem, &prv->depletedq);
579 }
580
581 static void
582 replq_insert(const struct scheduler *ops, struct rt_unit *svc)
583 {
584 struct list_head *replq = rt_replq(ops);
585 struct rt_private *prv = rt_priv(ops);
586
587 ASSERT( !unit_on_replq(svc) );
588
589 /*
590 * The timer may be re-programmed if svc is inserted
591 * at the front of the event list.
592 */
593 if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
594 set_timer(&prv->repl_timer, svc->cur_deadline);
595 }
596
597 /*
598 * Removes and re-inserts an event in the replenishment queue.
599 * The aim is to update its position inside the queue, as its
600 * deadline (and hence its replenishment time) could have
601 * changed.
602 */
603 static void
604 replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
605 {
606 struct list_head *replq = rt_replq(ops);
607 const struct rt_unit *rearm_svc = svc;
608 bool rearm = false;
609
610 ASSERT( unit_on_replq(svc) );
611
612 /*
613 * If svc was at the front of the replenishment queue, we certainly
614 * need to re-program the timer, and we want to use the deadline of
615 * the unit which is now at the front of the queue (which may still
616 * be svc or not).
617 *
618 * We may also need to re-program, if svc has been put at the front
619 * of the replenishment queue when being re-inserted.
620 */
621 if ( deadline_queue_remove(replq, &svc->replq_elem) )
622 {
623 deadline_replq_insert(svc, &svc->replq_elem, replq);
624 rearm_svc = replq_elem(replq->next);
625 rearm = true;
626 }
627 else
628 rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
629
630 if ( rearm )
631 set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
632 }
633
634 /*
635 * Pick a valid resource for the unit.
636 * A valid resource for a unit is the intersection of the unit's affinity
637 * and the available resources.
638 */
639 static struct sched_resource *
640 rt_res_pick_locked(const struct sched_unit *unit, unsigned int locked_cpu)
641 {
642 cpumask_t *cpus = cpumask_scratch_cpu(locked_cpu);
643 const cpumask_t *online;
644 int cpu;
645
646 online = cpupool_domain_master_cpumask(unit->domain);
647 cpumask_and(cpus, online, unit->cpu_hard_affinity);
648
649 cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
650 ? sched_unit_master(unit)
651 : cpumask_cycle(sched_unit_master(unit), cpus);
652 ASSERT( !cpumask_empty(cpus) && cpumask_test_cpu(cpu, cpus) );
653
654 return get_sched_res(cpu);
655 }
656
657 /*
658 * Pick a valid resource for the unit.
659 * A valid resource for a unit is the intersection of the unit's affinity
660 * and the available resources.
661 */
662 static struct sched_resource *
663 rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
664 {
665 struct sched_resource *res;
666
667 res = rt_res_pick_locked(unit, unit->res->master_cpu);
668
669 return res;
670 }
671
672 /*
673 * Init/Free related code
674 */
675 static int
676 rt_init(struct scheduler *ops)
677 {
678 int rc = -ENOMEM;
679 struct rt_private *prv = xzalloc(struct rt_private);
680
681 printk("Initializing RTDS scheduler\n"
682 "WARNING: This is experimental software in development.\n"
683 "Use at your own risk.\n");
684
685 if ( prv == NULL )
686 goto err;
687
688 spin_lock_init(&prv->lock);
689 INIT_LIST_HEAD(&prv->sdom);
690 INIT_LIST_HEAD(&prv->runq);
691 INIT_LIST_HEAD(&prv->depletedq);
692 INIT_LIST_HEAD(&prv->replq);
693
694 ops->sched_data = prv;
695 rc = 0;
696
697 err:
698 if ( rc )
699 xfree(prv);
700
701 return rc;
702 }
703
704 static void
705 rt_deinit(struct scheduler *ops)
706 {
707 struct rt_private *prv = rt_priv(ops);
708
709 ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
710 prv->repl_timer.status == TIMER_STATUS_killed);
711
712 ops->sched_data = NULL;
713 xfree(prv);
714 }
715
716 /* Change the scheduler of cpu to us (RTDS). */
717 static spinlock_t *
718 rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
719 void *pdata, void *vdata)
720 {
721 struct rt_private *prv = rt_priv(new_ops);
722 struct rt_unit *svc = vdata;
723
724 ASSERT(!pdata && svc && is_idle_unit(svc->unit));
725
726 /*
727 * We are holding the runqueue lock already (it's been taken in
728 * schedule_cpu_switch()). It's actually the runqueue lock of
729 * another scheduler, but that is how things need to be, for
730 * preventing races.
731 */
732 ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);
733
734 /*
735 * If we are the absolute first cpu being switched toward this
736 * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
737 * first one that is added back to the cpupool that had all its cpus
738 * removed (in which case we'll see TIMER_STATUS_killed), it's our
739 * job to (re)initialize the timer.
740 */
741 if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
742 prv->repl_timer.status == TIMER_STATUS_killed )
743 {
744 init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
745 dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
746 }
747
748 sched_idle_unit(cpu)->priv = vdata;
749
750 return &prv->lock;
751 }
752
753 static void
754 rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
755 {
756 unsigned long flags;
757 struct rt_private *prv = rt_priv(ops);
758
759 spin_lock_irqsave(&prv->lock, flags);
760
761 if ( prv->repl_timer.cpu == cpu )
762 {
763 cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid;
764 unsigned int new_cpu = cpumask_cycle(cpu, online);
765
766 /*
767 * Make sure the timer runs on one of the cpus that are still available
768 * to this scheduler. If there aren't any left, it's time to just
769 * kill it.
770 */
771 if ( new_cpu >= nr_cpu_ids )
772 {
773 kill_timer(&prv->repl_timer);
774 dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
775 }
776 else
777 {
778 migrate_timer(&prv->repl_timer, new_cpu);
779 }
780 }
781
782 spin_unlock_irqrestore(&prv->lock, flags);
783 }
784
785 static void *
786 rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
787 {
788 unsigned long flags;
789 struct rt_dom *sdom;
790 struct rt_private * prv = rt_priv(ops);
791
792 sdom = xzalloc(struct rt_dom);
793 if ( sdom == NULL )
794 return ERR_PTR(-ENOMEM);
795
796 INIT_LIST_HEAD(&sdom->sdom_elem);
797 sdom->dom = dom;
798
799 /* spinlock here to insert the dom */
800 spin_lock_irqsave(&prv->lock, flags);
801 list_add_tail(&sdom->sdom_elem, &(prv->sdom));
802 spin_unlock_irqrestore(&prv->lock, flags);
803
804 return sdom;
805 }
806
807 static void
808 rt_free_domdata(const struct scheduler *ops, void *data)
809 {
810 struct rt_dom *sdom = data;
811 struct rt_private *prv = rt_priv(ops);
812
813 if ( sdom )
814 {
815 unsigned long flags;
816
817 spin_lock_irqsave(&prv->lock, flags);
818 list_del_init(&sdom->sdom_elem);
819 spin_unlock_irqrestore(&prv->lock, flags);
820
821 xfree(sdom);
822 }
823 }
824
825 static void *
826 rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
827 {
828 struct rt_unit *svc;
829
830 /* Allocate per-UNIT info */
831 svc = xzalloc(struct rt_unit);
832 if ( svc == NULL )
833 return NULL;
834
835 INIT_LIST_HEAD(&svc->q_elem);
836 INIT_LIST_HEAD(&svc->replq_elem);
837 svc->flags = 0U;
838 svc->sdom = dd;
839 svc->unit = unit;
840 svc->last_start = 0;
841
842 __set_bit(__RTDS_extratime, &svc->flags);
843 svc->priority_level = 0;
844 svc->period = RTDS_DEFAULT_PERIOD;
845 if ( !is_idle_unit(unit) )
846 svc->budget = RTDS_DEFAULT_BUDGET;
847
848 SCHED_STAT_CRANK(unit_alloc);
849
850 return svc;
851 }
852
853 static void
854 rt_free_udata(const struct scheduler *ops, void *priv)
855 {
856 struct rt_unit *svc = priv;
857
858 xfree(svc);
859 }
860
861 /*
862 * Called from sched_move_domain() and sched_init_vcpu()
863 * in schedule.c,
864 * e.g. when a domain is moved to a new cpupool.
865 * It inserts the units of the moving domain into the RunQ of
866 * the destination cpupool's scheduler.
867 */
868 static void
869 rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
870 {
871 struct rt_unit *svc = rt_unit(unit);
872 s_time_t now;
873 spinlock_t *lock;
874 unsigned int cpu = smp_processor_id();
875
876 BUG_ON( is_idle_unit(unit) );
877
878 /* This is safe because unit isn't yet being scheduled */
879 lock = pcpu_schedule_lock_irq(cpu);
880 sched_set_res(unit, rt_res_pick_locked(unit, cpu));
881 pcpu_schedule_unlock_irq(lock, cpu);
882
883 lock = unit_schedule_lock_irq(unit);
884
885 now = NOW();
886 if ( now >= svc->cur_deadline )
887 rt_update_deadline(now, svc);
888
889 if ( !unit_on_q(svc) && unit_runnable(unit) )
890 {
891 replq_insert(ops, svc);
892
893 if ( !unit->is_running )
894 runq_insert(ops, svc);
895 }
896 unit_schedule_unlock_irq(lock, unit);
897
898 SCHED_STAT_CRANK(unit_insert);
899 }
900
901 /*
902 * Remove rt_unit svc from the old scheduler in source cpupool.
903 */
904 static void
905 rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
906 {
907 struct rt_unit * const svc = rt_unit(unit);
908 struct rt_dom * const sdom = svc->sdom;
909 spinlock_t *lock;
910
911 SCHED_STAT_CRANK(unit_remove);
912
913 BUG_ON( sdom == NULL );
914
915 lock = unit_schedule_lock_irq(unit);
916 if ( unit_on_q(svc) )
917 q_remove(svc);
918
919 if ( unit_on_replq(svc) )
920 replq_remove(ops,svc);
921
922 unit_schedule_unlock_irq(lock, unit);
923 }
924
925 /*
926 * Burn budget at nanosecond granularity
927 */
928 static void
929 burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
930 {
931 s_time_t delta;
932
933 /* don't burn budget for idle UNIT */
934 if ( is_idle_unit(svc->unit) )
935 return;
936
937 /* burn at nanoseconds level */
938 delta = now - svc->last_start;
939 /*
940 * delta < 0 only happens in nested virtualization;
941 * TODO: how should we handle delta < 0 in a better way?
942 */
943 if ( delta < 0 )
944 {
945 printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
946 __func__, delta);
947 svc->last_start = now;
948 return;
949 }
950
951 svc->cur_budget -= delta;
952 svc->last_start = now;
953
954 if ( svc->cur_budget <= 0 )
955 {
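        /*
         * Out of budget: a unit with extratime moves down one priority level
         * and keeps running on a fresh budget; otherwise the unit is marked
         * depleted and must wait for its next replenishment.
         */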
956 if ( has_extratime(svc) )
957 {
958 svc->priority_level++;
959 svc->cur_budget = svc->budget;
960 }
961 else
962 {
963 svc->cur_budget = 0;
964 __set_bit(__RTDS_depleted, &svc->flags);
965 }
966 }
967
968 /* TRACE */
969 {
970 struct __packed {
971 unsigned unit:16, dom:16;
972 uint64_t cur_budget;
973 int delta;
974 unsigned priority_level;
975 bool has_extratime;
976 } d;
977 d.dom = svc->unit->domain->domain_id;
978 d.unit = svc->unit->unit_id;
979 d.cur_budget = (uint64_t) svc->cur_budget;
980 d.delta = delta;
981 d.priority_level = svc->priority_level;
982 d.has_extratime = svc->flags & RTDS_extratime;
983 trace_var(TRC_RTDS_BUDGET_BURN, 1,
984 sizeof(d),
985 (unsigned char *) &d);
986 }
987 }
988
989 /*
990 * The RunQ is sorted. Pick the first runnable unit within the cpumask; if
991 * there is none, return NULL. The lock is grabbed before calling this function.
992 */
993 static struct rt_unit *
994 runq_pick(const struct scheduler *ops, const cpumask_t *mask, unsigned int cpu)
995 {
996 struct list_head *runq = rt_runq(ops);
997 struct list_head *iter;
998 struct rt_unit *svc = NULL;
999 struct rt_unit *iter_svc = NULL;
1000 cpumask_t *cpu_common = cpumask_scratch_cpu(cpu);
1001 const cpumask_t *online;
1002
1003 list_for_each ( iter, runq )
1004 {
1005 iter_svc = q_elem(iter);
1006
1007 /* mask cpu_hard_affinity & cpupool & mask */
1008 online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
1009 cpumask_and(cpu_common, online, iter_svc->unit->cpu_hard_affinity);
1010 cpumask_and(cpu_common, mask, cpu_common);
1011 if ( cpumask_empty(cpu_common) )
1012 continue;
1013
1014 ASSERT( iter_svc->cur_budget > 0 );
1015
1016 svc = iter_svc;
1017 break;
1018 }
1019
1020 /* TRACE */
1021 {
1022 if( svc != NULL )
1023 {
1024 struct __packed {
1025 unsigned unit:16, dom:16;
1026 uint64_t cur_deadline, cur_budget;
1027 } d;
1028 d.dom = svc->unit->domain->domain_id;
1029 d.unit = svc->unit->unit_id;
1030 d.cur_deadline = (uint64_t) svc->cur_deadline;
1031 d.cur_budget = (uint64_t) svc->cur_budget;
1032 trace_var(TRC_RTDS_RUNQ_PICK, 1,
1033 sizeof(d),
1034 (unsigned char *) &d);
1035 }
1036 }
1037
1038 return svc;
1039 }
1040
1041 /*
1042 * Schedule function for the rt scheduler.
1043 * The lock is already grabbed in schedule.c, no need to lock here
1044 */
1045 static void
1046 rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
1047 s_time_t now, bool tasklet_work_scheduled)
1048 {
1049 const unsigned int cur_cpu = smp_processor_id();
1050 const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
1051 struct rt_private *prv = rt_priv(ops);
1052 struct rt_unit *const scurr = rt_unit(currunit);
1053 struct rt_unit *snext = NULL;
1054 bool migrated = false;
1055
1056 /* TRACE */
1057 {
1058 struct __packed {
1059 unsigned cpu:16, tasklet:8, tickled:4, idle:4;
1060 } d;
1061 d.cpu = cur_cpu;
1062 d.tasklet = tasklet_work_scheduled;
1063 d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled);
1064 d.idle = is_idle_unit(currunit);
1065 trace_var(TRC_RTDS_SCHEDULE, 1,
1066 sizeof(d),
1067 (unsigned char *)&d);
1068 }
1069
1070 /* clear the tickled bit now that we've been scheduled */
1071 cpumask_clear_cpu(sched_cpu, &prv->tickled);
1072
1073 /* burn_budget() returns early for the idle UNIT */
1074 burn_budget(ops, scurr, now);
1075
1076 if ( tasklet_work_scheduled )
1077 {
1078 trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL);
1079 snext = rt_unit(sched_idle_unit(sched_cpu));
1080 }
1081 else
1082 {
1083 snext = runq_pick(ops, cpumask_of(sched_cpu), cur_cpu);
1084
1085 if ( snext == NULL )
1086 snext = rt_unit(sched_idle_unit(sched_cpu));
1087 else if ( !unit_runnable_state(snext->unit) )
1088 {
1089 q_remove(snext);
1090 snext = rt_unit(sched_idle_unit(sched_cpu));
1091 }
1092
1093 /* if scurr has higher priority and budget, still pick scurr */
1094 if ( !is_idle_unit(currunit) &&
1095 unit_runnable_state(currunit) &&
1096 scurr->cur_budget > 0 &&
1097 ( is_idle_unit(snext->unit) ||
1098 compare_unit_priority(scurr, snext) > 0 ) )
1099 snext = scurr;
1100 }
1101
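    /*
     * If we are switching away from a unit that is still runnable, it cannot
     * be put back on the queues until its context has been saved; flag it so
     * that rt_context_saved() re-inserts it.
     */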
1102 if ( snext != scurr &&
1103 !is_idle_unit(currunit) &&
1104 unit_runnable(currunit) )
1105 __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1106
1107 snext->last_start = now;
1108 currunit->next_time = -1; /* if an idle unit is picked */
1109 if ( !is_idle_unit(snext->unit) )
1110 {
1111 if ( snext != scurr )
1112 {
1113 q_remove(snext);
1114 __set_bit(__RTDS_scheduled, &snext->flags);
1115 }
1116 if ( sched_unit_master(snext->unit) != sched_cpu )
1117 {
1118 sched_set_res(snext->unit, get_sched_res(sched_cpu));
1119 migrated = true;
1120 }
1121 /* Invoke the scheduler next time. */
1122 currunit->next_time = snext->cur_budget;
1123 }
1124 currunit->next_task = snext->unit;
1125 snext->unit->migrated = migrated;
1126 }
1127
1128 /*
1129 * Remove UNIT from RunQ
1130 * The lock is already grabbed in schedule.c, no need to lock here
1131 */
1132 static void
1133 rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
1134 {
1135 struct rt_unit * const svc = rt_unit(unit);
1136
1137 BUG_ON( is_idle_unit(unit) );
1138 SCHED_STAT_CRANK(unit_sleep);
1139
1140 if ( curr_on_cpu(sched_unit_master(unit)) == unit )
1141 cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
1142 else if ( unit_on_q(svc) )
1143 {
1144 q_remove(svc);
1145 replq_remove(ops, svc);
1146 }
1147 else if ( svc->flags & RTDS_delayed_runq_add )
1148 __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1149 }
1150
1151 /*
1152 * Pick a cpu on which to run a unit,
1153 * possibly kicking out the unit currently running there.
1154 * Called by wake() and context_saved().
1155 * We have a running candidate here; the kick logic is:
1156 * Among all the cpus that are within the cpu affinity:
1157 * 1) if there are any idle CPUs, kick one.
1158 * For cache benefit, we check new->cpu first;
1159 * 2) now all pcpus are busy;
1160 * among all the running units, pick the lowest-priority one;
1161 * if snext has higher priority, kick it.
1162 *
1163 * TODO:
1164 * 1) what if these two units belong to the same domain?
1165 * Replacing a unit belonging to the same domain introduces more overhead.
1166 *
1167 * lock is grabbed before calling this function
1168 */
1169 static void
1170 runq_tickle(const struct scheduler *ops, const struct rt_unit *new)
1171 {
1172 struct rt_private *prv = rt_priv(ops);
1173 const struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
1174 const struct rt_unit *iter_svc;
1175 const struct sched_unit *iter_unit;
1176 int cpu = 0, cpu_to_tickle = 0;
1177 cpumask_t *not_tickled = cpumask_scratch_cpu(smp_processor_id());
1178 const cpumask_t *online;
1179
1180 if ( new == NULL || is_idle_unit(new->unit) )
1181 return;
1182
1183 online = cpupool_domain_master_cpumask(new->unit->domain);
1184 cpumask_and(not_tickled, online, new->unit->cpu_hard_affinity);
1185 cpumask_andnot(not_tickled, not_tickled, &prv->tickled);
1186
1187 /*
1188 * 1) If there are any idle CPUs, kick one.
1189 * For cache benefit, we first check new->cpu.
1190 * The same loop also finds the one with the lowest priority.
1191 */
1192 cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), not_tickled);
1193 while ( cpu != nr_cpu_ids )
1194 {
1195 iter_unit = curr_on_cpu(cpu);
1196 if ( is_idle_unit(iter_unit) )
1197 {
1198 SCHED_STAT_CRANK(tickled_idle_cpu);
1199 cpu_to_tickle = cpu;
1200 goto out;
1201 }
1202 iter_svc = rt_unit(iter_unit);
1203 if ( latest_deadline_unit == NULL ||
1204 compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
1205 latest_deadline_unit = iter_svc;
1206
1207 cpumask_clear_cpu(cpu, not_tickled);
1208 cpu = cpumask_cycle(cpu, not_tickled);
1209 }
1210
1211 /* 2) candidate has higher priority, kick out the lowest-priority unit */
1212 if ( latest_deadline_unit != NULL &&
1213 compare_unit_priority(latest_deadline_unit, new) < 0 )
1214 {
1215 SCHED_STAT_CRANK(tickled_busy_cpu);
1216 cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
1217 goto out;
1218 }
1219
1220 /* didn't tickle any cpu */
1221 SCHED_STAT_CRANK(tickled_no_cpu);
1222 return;
1223 out:
1224 /* TRACE */
1225 {
1226 struct {
1227 unsigned cpu:16, pad:16;
1228 } d;
1229 d.cpu = cpu_to_tickle;
1230 d.pad = 0;
1231 trace_var(TRC_RTDS_TICKLE, 1,
1232 sizeof(d),
1233 (unsigned char *)&d);
1234 }
1235
1236 cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1237 cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1238 return;
1239 }
1240
1241 /*
1242 * Should always wake up a runnable unit and put it back on the RunQ.
1243 * Check priority to decide whether to raise an interrupt.
1244 * The lock is already grabbed in schedule.c, no need to lock here.
1245 * TODO: what if these two units belong to the same domain?
1246 */
1247 static void
1248 rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
1249 {
1250 struct rt_unit * const svc = rt_unit(unit);
1251 s_time_t now;
1252 bool missed;
1253
1254 BUG_ON( is_idle_unit(unit) );
1255
1256 if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
1257 {
1258 SCHED_STAT_CRANK(unit_wake_running);
1259 return;
1260 }
1261
1262 /* on RunQ/DepletedQ, just update info is ok */
1263 if ( unlikely(unit_on_q(svc)) )
1264 {
1265 SCHED_STAT_CRANK(unit_wake_onrunq);
1266 return;
1267 }
1268
1269 if ( likely(unit_runnable(unit)) )
1270 SCHED_STAT_CRANK(unit_wake_runnable);
1271 else
1272 SCHED_STAT_CRANK(unit_wake_not_runnable);
1273
1274 /*
1275 * If a deadline passed while svc was asleep/blocked, we need new
1276 * scheduling parameters (a new deadline and full budget).
1277 */
1278 now = NOW();
1279
1280 missed = ( now >= svc->cur_deadline );
1281 if ( missed )
1282 rt_update_deadline(now, svc);
1283
1284 /*
1285 * If context hasn't been saved for this unit yet, we can't put it on
1286 * the run-queue/depleted-queue. Instead, we set the appropriate flag,
1287 * and the unit will be put back on the queues after its context has been
1288 * saved (in rt_context_saved()).
1289 */
1290 if ( unlikely(svc->flags & RTDS_scheduled) )
1291 {
1292 __set_bit(__RTDS_delayed_runq_add, &svc->flags);
1293 /*
1294 * The unit is waking up already, and we didn't even have the time to
1295 * remove its next replenishment event from the replenishment queue
1296 * when it blocked! No big deal. If we did not miss the deadline in
1297 * the meantime, let's just leave it there. If we did, let's remove it
1298 * and queue a new one (to occur at our new deadline).
1299 */
1300 if ( missed )
1301 replq_reinsert(ops, svc);
1302 return;
1303 }
1304
1305 /* Replenishment event got cancelled when we blocked. Add it back. */
1306 replq_insert(ops, svc);
1307 /* insert svc to runq/depletedq because svc is not in queue now */
1308 runq_insert(ops, svc);
1309
1310 runq_tickle(ops, svc);
1311 }
1312
1313 /*
1314 * scurr has finished context switch, insert it back to the RunQ,
1315 * and then pick the highest priority unit from runq to run
1316 */
1317 static void
1318 rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
1319 {
1320 struct rt_unit *svc = rt_unit(unit);
1321 spinlock_t *lock = unit_schedule_lock_irq(unit);
1322
1323 __clear_bit(__RTDS_scheduled, &svc->flags);
1324 /* not insert idle unit to runq */
1325 if ( is_idle_unit(unit) )
1326 goto out;
1327
1328 if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1329 likely(unit_runnable(unit)) )
1330 {
1331 runq_insert(ops, svc);
1332 runq_tickle(ops, svc);
1333 }
1334 else
1335 replq_remove(ops, svc);
1336
1337 out:
1338 unit_schedule_unlock_irq(lock, unit);
1339 }
1340
1341 /*
1342 * set/get each unit info of each domain
1343 */
1344 static int
1345 rt_dom_cntl(
1346 const struct scheduler *ops,
1347 struct domain *d,
1348 struct xen_domctl_scheduler_op *op)
1349 {
1350 struct rt_private *prv = rt_priv(ops);
1351 struct rt_unit *svc;
1352 const struct sched_unit *unit;
1353 unsigned long flags;
1354 int rc = 0;
1355 struct xen_domctl_schedparam_vcpu local_sched;
1356 s_time_t period, budget;
1357 uint32_t index = 0;
1358
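    /*
     * The tool-stack interface expresses period/budget in microseconds,
     * while svc->period and svc->budget are kept in nanoseconds; hence the
     * MICROSECS() conversions below.
     */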
1359 switch ( op->cmd )
1360 {
1361 case XEN_DOMCTL_SCHEDOP_getinfo:
1362 /* Return the default parameters. */
1363 op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1364 op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1365 break;
1366 case XEN_DOMCTL_SCHEDOP_putinfo:
1367 if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1368 {
1369 rc = -EINVAL;
1370 break;
1371 }
1372 spin_lock_irqsave(&prv->lock, flags);
1373 for_each_sched_unit ( d, unit )
1374 {
1375 svc = rt_unit(unit);
1376 svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */
1377 svc->budget = MICROSECS(op->u.rtds.budget);
1378 }
1379 spin_unlock_irqrestore(&prv->lock, flags);
1380 break;
1381 case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1382 case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1383 while ( index < op->u.v.nr_vcpus )
1384 {
1385 if ( copy_from_guest_offset(&local_sched,
1386 op->u.v.vcpus, index, 1) )
1387 {
1388 rc = -EFAULT;
1389 break;
1390 }
1391 if ( local_sched.vcpuid >= d->max_vcpus ||
1392 d->vcpu[local_sched.vcpuid] == NULL )
1393 {
1394 rc = -EINVAL;
1395 break;
1396 }
1397
1398 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1399 {
1400 spin_lock_irqsave(&prv->lock, flags);
1401 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1402 local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1403 local_sched.u.rtds.period = svc->period / MICROSECS(1);
1404 if ( has_extratime(svc) )
1405 local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1406 else
1407 local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1408 spin_unlock_irqrestore(&prv->lock, flags);
1409
1410 if ( copy_to_guest_offset(op->u.v.vcpus, index,
1411 &local_sched, 1) )
1412 {
1413 rc = -EFAULT;
1414 break;
1415 }
1416 }
1417 else
1418 {
1419 period = MICROSECS(local_sched.u.rtds.period);
1420 budget = MICROSECS(local_sched.u.rtds.budget);
1421 if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1422 budget > period || period < RTDS_MIN_PERIOD )
1423 {
1424 rc = -EINVAL;
1425 break;
1426 }
1427
1428 spin_lock_irqsave(&prv->lock, flags);
1429 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1430 svc->period = period;
1431 svc->budget = budget;
1432 if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1433 __set_bit(__RTDS_extratime, &svc->flags);
1434 else
1435 __clear_bit(__RTDS_extratime, &svc->flags);
1436 spin_unlock_irqrestore(&prv->lock, flags);
1437 }
1438 /* Process at most 64 vCPUs without checking for preemption. */
1439 if ( (++index > 63) && hypercall_preempt_check() )
1440 break;
1441 }
1442 if ( !rc )
1443 /* notify the upper caller how many vCPUs have been processed. */
1444 op->u.v.nr_vcpus = index;
1445 break;
1446 }
1447
1448 return rc;
1449 }
1450
1451 /*
1452 * The replenishment timer handler picks units
1453 * from the replq and does the actual replenishment.
1454 */
1455 static void repl_timer_handler(void *data){
1456 s_time_t now;
1457 const struct scheduler *ops = data;
1458 struct rt_private *prv = rt_priv(ops);
1459 struct list_head *replq = rt_replq(ops);
1460 struct list_head *runq = rt_runq(ops);
1461 struct list_head *iter, *tmp;
1462 struct rt_unit *svc;
1463 LIST_HEAD(tmp_replq);
1464
1465 spin_lock_irq(&prv->lock);
1466
1467 now = NOW();
1468
1469 /*
1470 * Do the replenishment and move replenished units
1471 * to the temporary list to tickle.
1472 * If svc is on the run queue, we need to put it at
1473 * the correct place since its deadline has changed.
1474 */
1475 list_for_each_safe ( iter, tmp, replq )
1476 {
1477 svc = replq_elem(iter);
1478
1479 if ( now < svc->cur_deadline )
1480 break;
1481
1482 list_del(&svc->replq_elem);
1483 rt_update_deadline(now, svc);
1484 list_add(&svc->replq_elem, &tmp_replq);
1485
1486 if ( unit_on_q(svc) )
1487 {
1488 q_remove(svc);
1489 runq_insert(ops, svc);
1490 }
1491 }
1492
1493 /*
1494 * Iterate through the list of updated units.
1495 * If an updated unit is running, tickle the head of the
1496 * runqueue if it has a higher priority.
1497 * If an updated unit was depleted and on the runqueue, tickle it.
1498 * Finally, reinsert the units back into the replenishment events list.
1499 */
1500 list_for_each_safe ( iter, tmp, &tmp_replq )
1501 {
1502 svc = replq_elem(iter);
1503
1504 if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
1505 !list_empty(runq) )
1506 {
1507 struct rt_unit *next_on_runq = q_elem(runq->next);
1508
1509 if ( compare_unit_priority(svc, next_on_runq) < 0 )
1510 runq_tickle(ops, next_on_runq);
1511 }
1512 else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1513 unit_on_q(svc) )
1514 runq_tickle(ops, svc);
1515
1516 list_del(&svc->replq_elem);
1517 deadline_replq_insert(svc, &svc->replq_elem, replq);
1518 }
1519
1520 /*
1521 * If there are units left in the replenishment event list,
1522 * set the next replenishment to happen at the deadline of
1523 * the one in the front.
1524 */
1525 if ( !list_empty(replq) )
1526 set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);
1527
1528 spin_unlock_irq(&prv->lock);
1529 }
1530
1531 static const struct scheduler sched_rtds_def = {
1532 .name = "SMP RTDS Scheduler",
1533 .opt_name = "rtds",
1534 .sched_id = XEN_SCHEDULER_RTDS,
1535 .sched_data = NULL,
1536
1537 .dump_cpu_state = rt_dump_pcpu,
1538 .dump_settings = rt_dump,
1539 .init = rt_init,
1540 .deinit = rt_deinit,
1541 .switch_sched = rt_switch_sched,
1542 .deinit_pdata = rt_deinit_pdata,
1543 .alloc_domdata = rt_alloc_domdata,
1544 .free_domdata = rt_free_domdata,
1545 .alloc_udata = rt_alloc_udata,
1546 .free_udata = rt_free_udata,
1547 .insert_unit = rt_unit_insert,
1548 .remove_unit = rt_unit_remove,
1549
1550 .adjust = rt_dom_cntl,
1551
1552 .pick_resource = rt_res_pick,
1553 .do_schedule = rt_schedule,
1554 .sleep = rt_unit_sleep,
1555 .wake = rt_unit_wake,
1556 .context_saved = rt_context_saved,
1557 };
1558
1559 REGISTER_SCHEDULER(sched_rtds_def);
1560