1 /******************************************************************************
2  * arch/x86/time.c
3  *
4  * Per-CPU time calibration and management.
5  *
6  * Copyright (c) 2002-2005, K A Fraser
7  *
8  * Portions from Linux are:
9  * Copyright (c) 1991, 1992, 1995  Linus Torvalds
10  */
11 
12 #include <xen/errno.h>
13 #include <xen/event.h>
14 #include <xen/sched.h>
15 #include <xen/lib.h>
16 #include <xen/init.h>
17 #include <xen/param.h>
18 #include <xen/time.h>
19 #include <xen/timer.h>
20 #include <xen/smp.h>
21 #include <xen/irq.h>
22 #include <xen/pci_ids.h>
23 #include <xen/softirq.h>
24 #include <xen/efi.h>
25 #include <xen/cpuidle.h>
26 #include <xen/symbols.h>
27 #include <xen/keyhandler.h>
28 #include <xen/guest_access.h>
29 #include <asm/io.h>
30 #include <asm/iocap.h>
31 #include <asm/msr.h>
32 #include <asm/mpspec.h>
33 #include <asm/processor.h>
34 #include <asm/fixmap.h>
35 #include <asm/guest.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/div64.h>
38 #include <asm/acpi.h>
39 #include <asm/hpet.h>
40 #include <io_ports.h>
41 #include <asm/setup.h> /* for early_time_init */
42 #include <public/arch-x86/cpuid.h>
43 
44 /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi, tsc. */
45 static char __initdata opt_clocksource[10];
46 string_param("clocksource", opt_clocksource);
47 
48 unsigned long __read_mostly cpu_khz;  /* CPU clock frequency in kHz. */
49 DEFINE_SPINLOCK(rtc_lock);
50 unsigned long pit0_ticks;
51 
52 struct cpu_time_stamp {
53     u64 local_tsc;
54     s_time_t local_stime;
55     s_time_t master_stime;
56 };
57 
58 struct cpu_time {
59     struct cpu_time_stamp stamp;
60     struct time_scale tsc_scale;
61 };
62 
63 struct platform_timesource {
64     char *id;
65     char *name;
66     u64 frequency;
67     u64 (*read_counter)(void);
68     s64 (*init)(struct platform_timesource *);
69     void (*resume)(struct platform_timesource *);
70     int counter_bits;
71 };
72 
73 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
74 
75 /* Calibrate all CPUs to platform timer every EPOCH. */
76 #define EPOCH MILLISECS(1000)
77 static struct timer calibration_timer;
78 
79 /*
80  * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
81  * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
82  * softirq handling will happen in time.
83  *
84  * The pit_lock protects the 16- and 32-bit stamp fields as well as the PIT channel 2 accesses that update them.
85  */
86 static DEFINE_SPINLOCK(pit_lock);
87 static u16 pit_stamp16;
88 static u32 pit_stamp32;
89 static bool __read_mostly using_pit;
90 
91 /* Boot timestamp, filled in head.S */
92 u64 __initdata boot_tsc_stamp;
93 
94 /* Per-socket TSC_ADJUST values, for secondary cores/threads to sync to. */
95 static uint64_t *__read_mostly tsc_adjust;
96 
97 /*
98  * 32-bit division of integer dividend and integer divisor yielding
99  * 32-bit fractional quotient.
100  */
101 static inline u32 div_frac(u32 dividend, u32 divisor)
102 {
103     u32 quotient, remainder;
104     ASSERT(dividend < divisor);
105     asm (
106         "divl %4"
107         : "=a" (quotient), "=d" (remainder)
108         : "0" (0), "1" (dividend), "r" (divisor) );
109     return quotient;
110 }
111 
112 /*
113  * 32-bit multiplication of multiplicand and fractional multiplier
114  * yielding 32-bit product (radix point at same position as in multiplicand).
115  */
116 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
117 {
118     u32 product_int, product_frac;
119     asm (
120         "mul %3"
121         : "=a" (product_frac), "=d" (product_int)
122         : "0" (multiplicand), "r" (multiplier) );
123     return product_int;
124 }
125 
126 /*
127  * Scale a 64-bit delta by shifting and then multiplying by a 32-bit fraction,
128  * yielding a 64-bit result.
129  */
130 u64 scale_delta(u64 delta, const struct time_scale *scale)
131 {
132     u64 product;
133 
134     if ( scale->shift < 0 )
135         delta >>= -scale->shift;
136     else
137         delta <<= scale->shift;
138 
139     asm (
140         "mulq %2 ; shrd $32,%1,%0"
141         : "=a" (product), "=d" (delta)
142         : "rm" (delta), "0" ((u64)scale->mul_frac) );
143 
144     return product;
145 }
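/*
 * Worked example (illustrative, not part of the original source): with a
 * time_scale of { .shift = 0, .mul_frac = 0x80000000 } -- i.e. 0.5 in 0.32
 * fixed point, as produced for a 2 GHz TSC -- a delta of 3,000,000 TSC
 * ticks scales to (3,000,000 * 0x80000000) >> 32 = 1,500,000 ns.
 */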
146 
147 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
148 
149 /* Compute the reciprocal of the given time_scale. */
150 static inline struct time_scale scale_reciprocal(struct time_scale scale)
151 {
152     struct time_scale reciprocal;
153     u32 dividend;
154 
155     ASSERT(scale.mul_frac != 0);
156     dividend = _TS_MUL_FRAC_IDENTITY;
157     reciprocal.shift = 1 - scale.shift;
158     while ( unlikely(dividend >= scale.mul_frac) )
159     {
160         dividend >>= 1;
161         reciprocal.shift++;
162     }
163 
164     asm (
165         "divl %4"
166         : "=a" (reciprocal.mul_frac), "=d" (dividend)
167         : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
168 
169     return reciprocal;
170 }
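/*
 * Worked example (illustrative): the reciprocal of { .shift = 0,
 * .mul_frac = 0x80000000 } (the 2 GHz TSC scale above) comes out as
 * { .shift = 2, .mul_frac = 0x80000000 }: the dividend is halved once,
 * bumping the shift from 1 to 2, and the division yields 0x80000000.
 * scale_delta(1,500,000 ns, reciprocal) = (1,500,000 << 2) / 2 =
 * 3,000,000 TSC ticks, inverting the forward conversion.
 */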
171 
172 /*
173  * cpumask that denotes the CPUs that need the timer interrupt delivered as
174  * IPIs in place of their local APIC timers
175  */
176 static cpumask_t pit_broadcast_mask;
177 
178 static void smp_send_timer_broadcast_ipi(void)
179 {
180     int cpu = smp_processor_id();
181     cpumask_t mask;
182 
183     cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask);
184 
185     if ( cpumask_test_cpu(cpu, &mask) )
186     {
187         __cpumask_clear_cpu(cpu, &mask);
188         raise_softirq(TIMER_SOFTIRQ);
189     }
190 
191     if ( !cpumask_empty(&mask) )
192     {
193         cpumask_raise_softirq(&mask, TIMER_SOFTIRQ);
194     }
195 }
196 
197 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
198 {
199     ASSERT(local_irq_is_enabled());
200 
201     if ( hpet_legacy_irq_tick() )
202         return;
203 
204     /* Only for start-of-day interrupt tests in io_apic.c. */
205     pit0_ticks++;
206 
207     /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
208     if ( !cpu_has_apic )
209         raise_softirq(TIMER_SOFTIRQ);
210 
211     if ( xen_cpuidle )
212         smp_send_timer_broadcast_ipi();
213 
214     /* Emulate a 32-bit PIT counter. */
215     if ( using_pit )
216     {
217         u16 count;
218 
219         spin_lock_irq(&pit_lock);
220 
221         outb(0x80, PIT_MODE);
222         count  = inb(PIT_CH2);
223         count |= inb(PIT_CH2) << 8;
224 
225         pit_stamp32 += (u16)(pit_stamp16 - count);
226         pit_stamp16 = count;
227 
228         spin_unlock_irq(&pit_lock);
229     }
230 }
231 
232 static struct irqaction __read_mostly irq0 = {
233     timer_interrupt, "timer", NULL
234 };
235 
236 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
237 #define CALIBRATE_FRAC  20      /* calibrate over 50ms */
238 #define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC)
239 
240 static void preinit_pit(void)
241 {
242     /* Set PIT channel 0 to HZ Hz. */
243 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
244     outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
245     outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
246     outb(LATCH >> 8, PIT_CH0);     /* MSB */
247 #undef LATCH
248 }
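/*
 * Worked example (illustrative, assuming Xen's HZ of 100): LATCH =
 * (1193182 + 50) / 100 = 11932, so channel 0 fires once every 11932 input
 * clocks, i.e. at 1193182 / 11932 ~= 100.0 Hz.
 */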
249 
250 void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
251 {
252     u64 tps64 = ticks_per_sec;
253     u32 tps32;
254     int shift = 0;
255 
256     ASSERT(tps64 != 0);
257 
258     while ( tps64 > (MILLISECS(1000)*2) )
259     {
260         tps64 >>= 1;
261         shift--;
262     }
263 
264     tps32 = (u32)tps64;
265     while ( tps32 <= (u32)MILLISECS(1000) )
266     {
267         tps32 <<= 1;
268         shift++;
269     }
270 
271     ts->mul_frac = div_frac(MILLISECS(1000), tps32);
272     ts->shift    = shift;
273 }
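/*
 * Worked example (illustrative): for ticks_per_sec = 2,000,000,000 (a 2 GHz
 * counter) neither loop shifts, so shift = 0 and mul_frac =
 * (1e9 << 32) / 2e9 = 0x80000000, i.e. "multiply by 0.5": two ticks per
 * nanosecond. Slower sources (e.g. the 1.19 MHz PIT) end up with a positive
 * shift so the 32-bit multiplier keeps full precision.
 */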
274 
275 static char *freq_string(u64 freq)
276 {
277     static char s[20];
278     unsigned int x, y;
279 
280     if ( do_div(freq, 1000) > 500 )
281         ++freq;
282     y = (unsigned int)do_div(freq, 1000);
283     x = (unsigned int)freq;
284     snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
285     return s;
286 }
287 
288 /************************************************************
289  * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
290  */
291 
292 static u64 read_pit_count(void)
293 {
294     u16 count16;
295     u32 count32;
296     unsigned long flags;
297 
298     spin_lock_irqsave(&pit_lock, flags);
299 
300     outb(0x80, PIT_MODE);
301     count16  = inb(PIT_CH2);
302     count16 |= inb(PIT_CH2) << 8;
303 
304     count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
305 
306     spin_unlock_irqrestore(&pit_lock, flags);
307 
308     return count32;
309 }
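/*
 * Illustrative note: PIT channel 2 is a down-counter that wraps modulo
 * 2^16, so (u16)(pit_stamp16 - count16) is the number of ticks elapsed
 * since the last read even across a wrap. E.g. pit_stamp16 = 0x0005 and
 * count16 = 0xfffb gives (u16)(0x0005 - 0xfffb) = 0x000a = 10 ticks.
 */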
310 
311 static s64 __init init_pit(struct platform_timesource *pts)
312 {
313     u8 portb = inb(0x61);
314     u64 start, end;
315     unsigned long count;
316 
317     using_pit = true;
318 
319     /* Set the Gate high, disable speaker. */
320     outb((portb & ~0x02) | 0x01, 0x61);
321 
322     /*
323      * Now let's take care of CTC channel 2: mode 0, (interrupt on
324      * terminal count mode), binary count, load CALIBRATE_LATCH count,
325      * (LSB and MSB) to begin countdown.
326      */
327 #define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE)
328     outb(0xb0, PIT_MODE);                  /* binary, mode 0, LSB/MSB, Ch 2 */
329     outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
330     outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
331 #undef CALIBRATE_LATCH
332 
333     start = rdtsc_ordered();
334     for ( count = 0; !(inb(0x61) & 0x20); ++count )
335         continue;
336     end = rdtsc_ordered();
337 
338     /* Set the Gate low, disable speaker. */
339     outb(portb & ~0x03, 0x61);
340 
341     /* Error if the CTC doesn't behave itself. */
342     if ( count == 0 )
343         return 0;
344 
345     return (end - start) * CALIBRATE_FRAC;
346 }
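/*
 * Worked example (illustrative): CALIBRATE_LATCH = (1193182 + 10) / 20 =
 * 59659 PIT ticks, roughly 50 ms. If the TSC advanced by 100,000,000 cycles
 * while channel 2 counted down, the returned estimate is
 * 100,000,000 * 20 = 2,000,000,000 Hz, i.e. a 2 GHz TSC.
 */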
347 
348 static void resume_pit(struct platform_timesource *pts)
349 {
350     /* Set CTC channel 2 to mode 0 again; initial value does not matter. */
351     outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
352     outb(0, PIT_CH2);     /* LSB of count */
353     outb(0, PIT_CH2);     /* MSB of count */
354 }
355 
356 static struct platform_timesource __initdata plt_pit =
357 {
358     .id = "pit",
359     .name = "PIT",
360     .frequency = CLOCK_TICK_RATE,
361     .read_counter = read_pit_count,
362     .counter_bits = 32,
363     .init = init_pit,
364     .resume = resume_pit,
365 };
366 
367 /************************************************************
368  * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
369  */
370 
371 static u64 read_hpet_count(void)
372 {
373     return hpet_read32(HPET_COUNTER);
374 }
375 
376 static int64_t __init init_hpet(struct platform_timesource *pts)
377 {
378     uint64_t hpet_rate, start;
379     uint32_t count, target;
380 
381     if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
382          cpuidle_using_deep_cstate() )
383     {
384         if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
385                              PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
386             switch ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0), PCI_DEVICE_ID) )
387             {
388             /* HPET on Bay Trail platforms will halt in deep C states. */
389             case 0x0f1c:
390             /* HPET on Cherry Trail platforms will halt in deep C states. */
391             case 0x229c:
392                 hpet_address = 0;
393                 break;
394             }
395 
396         /*
397          * Some Coffee Lake platforms have a skewed HPET timer once the SoC
398          * has entered PC10.
399          */
400         if ( pci_conf_read16(PCI_SBDF(0, 0, 0, 0),
401                              PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL &&
402              pci_conf_read16(PCI_SBDF(0, 0, 0, 0),
403                              PCI_DEVICE_ID) == 0x3ec4 )
404             hpet_address = 0;
405 
406         if ( !hpet_address )
407             printk("Disabling HPET for being unreliable\n");
408     }
409 
410     if ( (hpet_rate = hpet_setup()) == 0 )
411         return 0;
412 
413     pts->frequency = hpet_rate;
414 
415     count = hpet_read32(HPET_COUNTER);
416     start = rdtsc_ordered();
417     target = count + CALIBRATE_VALUE(hpet_rate);
418     if ( target < count )
419         while ( hpet_read32(HPET_COUNTER) >= count )
420             continue;
421     while ( hpet_read32(HPET_COUNTER) < target )
422         continue;
423 
424     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
425 }
426 
427 static void resume_hpet(struct platform_timesource *pts)
428 {
429     hpet_resume(NULL);
430 }
431 
432 static struct platform_timesource __initdata plt_hpet =
433 {
434     .id = "hpet",
435     .name = "HPET",
436     .read_counter = read_hpet_count,
437     .counter_bits = 32,
438     .init = init_hpet,
439     .resume = resume_hpet
440 };
441 
442 /************************************************************
443  * PLATFORM TIMER 3: ACPI PM TIMER
444  */
445 
446 u32 __read_mostly pmtmr_ioport;
447 unsigned int __initdata pmtmr_width;
448 
449 /* ACPI PM timer ticks at 3.579545 MHz. */
450 #define ACPI_PM_FREQUENCY 3579545
451 
452 static u64 read_pmtimer_count(void)
453 {
454     return inl(pmtmr_ioport);
455 }
456 
457 static s64 __init init_pmtimer(struct platform_timesource *pts)
458 {
459     u64 start;
460     u32 count, target, mask;
461 
462     if ( !pmtmr_ioport || (pmtmr_width != 24 && pmtmr_width != 32) )
463         return 0;
464 
465     pts->counter_bits = pmtmr_width;
466     mask = 0xffffffff >> (32 - pmtmr_width);
467 
468     count = inl(pmtmr_ioport) & mask;
469     start = rdtsc_ordered();
470     target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
471     if ( target < count )
472         while ( (inl(pmtmr_ioport) & mask) >= count )
473             continue;
474     while ( (inl(pmtmr_ioport) & mask) < target )
475         continue;
476 
477     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
478 }
479 
480 static struct platform_timesource __initdata plt_pmtimer =
481 {
482     .id = "acpi",
483     .name = "ACPI PM Timer",
484     .frequency = ACPI_PM_FREQUENCY,
485     .read_counter = read_pmtimer_count,
486     .init = init_pmtimer
487 };
488 
489 static struct time_scale __read_mostly pmt_scale;
490 static struct time_scale __read_mostly pmt_scale_r;
491 
492 static __init int init_pmtmr_scale(void)
493 {
494     set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
495     pmt_scale_r = scale_reciprocal(pmt_scale);
496     return 0;
497 }
498 __initcall(init_pmtmr_scale);
499 
500 uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
501 {
502     return scale_delta(ticks, &pmt_scale);
503 }
504 
505 uint64_t ns_to_acpi_pm_tick(uint64_t ns)
506 {
507     return scale_delta(ns, &pmt_scale_r);
508 }
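/*
 * Usage note (illustrative): the two helpers are near-inverses built from
 * pmt_scale and its reciprocal, e.g. acpi_pm_tick_to_ns(3579545) ~=
 * 1,000,000,000 ns and ns_to_acpi_pm_tick(1000000000) ~= 3,579,545 ticks,
 * up to fixed-point rounding error.
 */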
509 
510 /************************************************************
511  * PLATFORM TIMER 4: TSC
512  */
513 static unsigned int __initdata tsc_flags;
514 
515 /* TSC is reliable across sockets */
516 #define TSC_RELIABLE_SOCKET (1 << 0)
517 
518 /*
519  * Called in verify_tsc_reliability() under reliable TSC conditions,
520  * thus reusing all the checks already performed there.
521  */
522 static s64 __init init_tsc(struct platform_timesource *pts)
523 {
524     u64 ret = pts->frequency;
525 
526     if ( nr_cpu_ids != num_present_cpus() )
527     {
528         printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n");
529         ret = 0;
530     }
531 
532     if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) )
533     {
534         printk(XENLOG_WARNING "TSC: Not invariant across sockets\n");
535         ret = 0;
536     }
537 
538     if ( !ret )
539         printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n");
540 
541     return ret;
542 }
543 
544 static u64 read_tsc(void)
545 {
546     return rdtsc_ordered();
547 }
548 
549 static struct platform_timesource __initdata plt_tsc =
550 {
551     .id = "tsc",
552     .name = "TSC",
553     .read_counter = read_tsc,
554     /*
555      * Calculations for platform timer overflow assume the counter is
556      * narrower than 64 bits. Hence counter_bits is set to 63, so that
557      * TSC wraparound is still checked and handled correctly.
558      */
559     .counter_bits = 63,
560     .init = init_tsc,
561 };
562 
563 #ifdef CONFIG_XEN_GUEST
564 /************************************************************
565  * PLATFORM TIMER 5: XEN PV CLOCK SOURCE
566  *
567  * Xen clock source is a variant of TSC source.
568  */
569 static uint64_t xen_timer_last;
570 
571 static uint64_t xen_timer_cpu_frequency(void)
572 {
573     struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
574     uint64_t freq;
575 
576     freq = 1000000000ULL << 32;
577     do_div(freq, info->tsc_to_system_mul);
578     if ( info->tsc_shift < 0 )
579         freq <<= -info->tsc_shift;
580     else
581         freq >>= info->tsc_shift;
582 
583     return freq;
584 }
585 
586 static int64_t __init init_xen_timer(struct platform_timesource *pts)
587 {
588     if ( !xen_guest )
589         return 0;
590 
591     return xen_timer_cpu_frequency();
592 }
593 
594 static always_inline uint64_t read_cycle(const struct vcpu_time_info *info,
595                                          uint64_t tsc)
596 {
597     uint64_t delta = tsc - info->tsc_timestamp;
598     struct time_scale ts = {
599         .shift    = info->tsc_shift,
600         .mul_frac = info->tsc_to_system_mul,
601     };
602     uint64_t offset = scale_delta(delta, &ts);
603 
604     return info->system_time + offset;
605 }
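/*
 * Illustrative summary of the pvclock conversion performed above:
 *
 *   system_time_ns = info->system_time +
 *       (((tsc - info->tsc_timestamp) shifted by tsc_shift)
 *        * info->tsc_to_system_mul) >> 32
 *
 * where the delta is shifted left for a positive tsc_shift and right for a
 * negative one, matching scale_delta().
 */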
606 
607 static uint64_t read_xen_timer(void)
608 {
609     struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
610     uint32_t version;
611     uint64_t ret;
612     uint64_t last;
613 
614     do {
615         version = info->version & ~1;
616         /* Make sure version is read before the data */
617         smp_rmb();
618 
619         ret = read_cycle(info, rdtsc_ordered());
620         /* Ignore fancy flags for now */
621 
622         /* Make sure version is reread after the data */
623         smp_rmb();
624     } while ( unlikely(version != info->version) );
625 
626     /* Maintain a monotonic global value */
627     do {
628         last = read_atomic(&xen_timer_last);
629         if ( ret < last )
630             return last;
631     } while ( unlikely(cmpxchg(&xen_timer_last, last, ret) != last) );
632 
633     return ret;
634 }
635 
636 static void resume_xen_timer(struct platform_timesource *pts)
637 {
638     write_atomic(&xen_timer_last, 0);
639 }
640 
641 static struct platform_timesource __initdata plt_xen_timer =
642 {
643     .id = "xen",
644     .name = "XEN PV CLOCK",
645     .frequency = 1000000000ULL,
646     .read_counter = read_xen_timer,
647     .init = init_xen_timer,
648     .resume = resume_xen_timer,
649     .counter_bits = 63,
650 };
651 #endif
652 
653 #ifdef CONFIG_HYPERV_GUEST
654 /************************************************************
655  * HYPER-V REFERENCE TSC
656  */
657 #include <asm/guest/hyperv-tlfs.h>
658 
659 static struct ms_hyperv_tsc_page *hyperv_tsc;
660 static struct page_info *hyperv_tsc_page;
661 
662 static int64_t __init init_hyperv_timer(struct platform_timesource *pts)
663 {
664     paddr_t maddr;
665     uint64_t tsc_msr, freq;
666 
667     if ( !(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) ||
668          !(ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS) )
669         return 0;
670 
671     hyperv_tsc_page = alloc_domheap_page(NULL, 0);
672     if ( !hyperv_tsc_page )
673         return 0;
674 
675     hyperv_tsc = __map_domain_page_global(hyperv_tsc_page);
676     if ( !hyperv_tsc )
677     {
678         free_domheap_page(hyperv_tsc_page);
679         hyperv_tsc_page = NULL;
680         return 0;
681     }
682 
683     maddr = page_to_maddr(hyperv_tsc_page);
684 
685     /*
686      * Per Hyper-V TLFS:
687      *   1. Read existing MSR value
688      *   2. Preserve bits [11:1]
689      *   3. Set bits [63:12] to be guest physical address of tsc page
690      *   4. Set enabled bit (0)
691      *   5. Write back new MSR value
692      */
693     rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr);
694     tsc_msr &= 0xffe;
695     tsc_msr |= maddr | 1 /* enabled */;
696     wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr);
697 
698     /* Get TSC frequency from Hyper-V */
699     rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
700     pts->frequency = freq;
701 
702     return freq;
703 }
704 
705 static uint64_t read_hyperv_timer(void)
706 {
707     uint64_t scale, ret, tsc;
708     int64_t offset;
709     uint32_t seq;
710     const struct ms_hyperv_tsc_page *tsc_page = hyperv_tsc;
711 
712     do {
713         seq = tsc_page->tsc_sequence;
714 
715         /* Seq 0 is special. It means the TSC enlightenment is not
716          * available at the moment. The reference time can only be
717          * obtained from the Reference Counter MSR.
718          */
719         if ( seq == 0 )
720         {
721             rdmsrl(HV_X64_MSR_TIME_REF_COUNT, ret);
722             return ret;
723         }
724 
725         /* rdtsc_ordered already contains a load fence */
726         tsc = rdtsc_ordered();
727         scale = tsc_page->tsc_scale;
728         offset = tsc_page->tsc_offset;
729 
730         smp_rmb();
731 
732     } while ( tsc_page->tsc_sequence != seq );
733 
734     return hv_scale_tsc(tsc, scale, offset);
735 }
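/*
 * Illustrative note: the sequence value read before and after the data
 * guards against the hypervisor updating the TSC page mid-read, in the same
 * spirit as the pvclock version check above. Per the Hyper-V TLFS the
 * reference time derived from the page is
 * ((tsc * tsc_scale) >> 64) + tsc_offset, which hv_scale_tsc() is expected
 * to compute.
 */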
736 
737 static struct platform_timesource __initdata plt_hyperv_timer =
738 {
739     .id = "hyperv",
740     .name = "HYPER-V REFERENCE TSC",
741     .read_counter = read_hyperv_timer,
742     .init = init_hyperv_timer,
743     /* See TSC time source for why counter_bits is set to 63 */
744     .counter_bits = 63,
745 };
746 #endif
747 
748 /************************************************************
749  * GENERIC PLATFORM TIMER INFRASTRUCTURE
750  */
751 
752 /* details of chosen timesource */
753 static struct platform_timesource __read_mostly plt_src;
754 /* hardware-width mask */
755 static u64 __read_mostly plt_mask;
756  /* ns between calls to plt_overflow() */
757 static u64 __read_mostly plt_overflow_period;
758 /* scale: platform counter -> nanosecs */
759 static struct time_scale __read_mostly plt_scale;
760 
761 /* Protected by platform_timer_lock. */
762 static DEFINE_SPINLOCK(platform_timer_lock);
763 static s_time_t stime_platform_stamp; /* System time at below platform time */
764 static u64 platform_timer_stamp;      /* Platform time at above system time */
765 static u64 plt_stamp64;          /* 64-bit platform counter stamp           */
766 static u64 plt_stamp;            /* hardware-width platform counter stamp   */
767 static struct timer plt_overflow_timer;
768 
769 static s_time_t __read_platform_stime(u64 platform_time)
770 {
771     u64 diff = platform_time - platform_timer_stamp;
772     ASSERT(spin_is_locked(&platform_timer_lock));
773     return (stime_platform_stamp + scale_delta(diff, &plt_scale));
774 }
775 
776 static void plt_overflow(void *unused)
777 {
778     int i;
779     u64 count;
780     s_time_t now, plt_now, plt_wrap;
781 
782     spin_lock_irq(&platform_timer_lock);
783 
784     count = plt_src.read_counter();
785     plt_stamp64 += (count - plt_stamp) & plt_mask;
786     plt_stamp = count;
787 
788     now = NOW();
789     plt_wrap = __read_platform_stime(plt_stamp64);
790     for ( i = 0; i < 10; i++ )
791     {
792         plt_now = plt_wrap;
793         plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1);
794         if ( ABS(plt_wrap - now) > ABS(plt_now - now) )
795             break;
796         plt_stamp64 += plt_mask + 1;
797     }
798     if ( i != 0 )
799     {
800         static bool warned_once;
801 
802         if ( !test_and_set_bool(warned_once) )
803             printk("Platform timer appears to have unexpectedly wrapped "
804                    "%u%s times.\n", i, (i == 10) ? " or more" : "");
805     }
806 
807     spin_unlock_irq(&platform_timer_lock);
808 
809     set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
810 }
811 
812 static s_time_t read_platform_stime(u64 *stamp)
813 {
814     u64 plt_counter, count;
815     s_time_t stime;
816 
817     ASSERT(!local_irq_is_enabled());
818 
819     spin_lock(&platform_timer_lock);
820     plt_counter = plt_src.read_counter();
821     count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask);
822     stime = __read_platform_stime(count);
823     spin_unlock(&platform_timer_lock);
824 
825     if ( unlikely(stamp) )
826         *stamp = plt_counter;
827 
828     return stime;
829 }
830 
831 static void platform_time_calibration(void)
832 {
833     u64 count;
834     s_time_t stamp;
835     unsigned long flags;
836 
837     spin_lock_irqsave(&platform_timer_lock, flags);
838     count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
839     stamp = __read_platform_stime(count);
840     stime_platform_stamp = stamp;
841     platform_timer_stamp = count;
842     spin_unlock_irqrestore(&platform_timer_lock, flags);
843 }
844 
845 static void resume_platform_timer(void)
846 {
847     /* Timer source can be reset when coming back from S3 to S0 */
848     if ( plt_src.resume )
849         plt_src.resume(&plt_src);
850 
851     plt_stamp64 = platform_timer_stamp;
852     plt_stamp = plt_src.read_counter();
853 }
854 
855 static void __init reset_platform_timer(void)
856 {
857     /* Deactivate any timers running */
858     kill_timer(&plt_overflow_timer);
859     kill_timer(&calibration_timer);
860 
861     /* Reset counters and stamps */
862     spin_lock_irq(&platform_timer_lock);
863     plt_stamp = 0;
864     plt_stamp64 = 0;
865     platform_timer_stamp = 0;
866     stime_platform_stamp = 0;
867     spin_unlock_irq(&platform_timer_lock);
868 }
869 
870 static s64 __init try_platform_timer(struct platform_timesource *pts)
871 {
872     s64 rc = pts->init(pts);
873 
874     if ( rc <= 0 )
875         return rc;
876 
877     /* We have a platform timesource already so reset it */
878     if ( plt_src.counter_bits != 0 )
879         reset_platform_timer();
880 
881     plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
882 
883     set_time_scale(&plt_scale, pts->frequency);
884 
885     plt_overflow_period = scale_delta(
886         1ull << (pts->counter_bits - 1), &plt_scale);
887     plt_src = *pts;
888 
889     return rc;
890 }
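/*
 * Worked example (illustrative): for a 32-bit HPET running at ~14.318 MHz,
 * plt_mask = 0xffffffff and plt_overflow_period covers half the counter
 * range, 2^31 / 14,318,180 ~= 150 seconds, so plt_overflow() is scheduled
 * at least that often to keep plt_stamp64 from missing a wrap.
 */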
891 
892 static u64 __init init_platform_timer(void)
893 {
894     static struct platform_timesource * __initdata plt_timers[] = {
895 #ifdef CONFIG_XEN_GUEST
896         &plt_xen_timer,
897 #endif
898 #ifdef CONFIG_HYPERV_GUEST
899         &plt_hyperv_timer,
900 #endif
901         &plt_hpet, &plt_pmtimer, &plt_pit
902     };
903 
904     struct platform_timesource *pts = NULL;
905     unsigned int i;
906     s64 rc = -1;
907 
908     /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */
909     if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") )
910     {
911         for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
912         {
913             pts = plt_timers[i];
914             if ( !strcmp(opt_clocksource, pts->id) )
915             {
916                 rc = try_platform_timer(pts);
917                 break;
918             }
919         }
920 
921         if ( rc <= 0 )
922             printk("WARNING: %s clocksource '%s'.\n",
923                    (rc == 0) ? "Could not initialise" : "Unrecognised",
924                    opt_clocksource);
925     }
926 
927     if ( rc <= 0 )
928     {
929         for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
930         {
931             pts = plt_timers[i];
932             if ( (rc = try_platform_timer(pts)) > 0 )
933                 break;
934         }
935     }
936 
937     if ( rc <= 0 )
938         panic("Unable to find usable platform timer\n");
939 
940     printk("Platform timer is %s %s\n",
941            freq_string(pts->frequency), pts->name);
942 
943     return rc;
944 }
945 
946 u64 stime2tsc(s_time_t stime)
947 {
948     struct cpu_time *t;
949     struct time_scale sys_to_tsc;
950     s_time_t stime_delta;
951 
952     t = &this_cpu(cpu_time);
953     sys_to_tsc = scale_reciprocal(t->tsc_scale);
954 
955     stime_delta = stime - t->stamp.local_stime;
956     if ( stime_delta < 0 )
957         stime_delta = 0;
958 
959     return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc);
960 }
961 
962 void cstate_restore_tsc(void)
963 {
964     struct cpu_time *t = &this_cpu(cpu_time);
965 
966     if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
967         return;
968 
969     t->stamp.master_stime = read_platform_stime(NULL);
970     t->stamp.local_tsc = stime2tsc(t->stamp.master_stime);
971     t->stamp.local_stime = t->stamp.master_stime;
972 
973     write_tsc(t->stamp.local_tsc);
974 }
975 
976 /***************************************************************************
977  * CMOS Timer functions
978  ***************************************************************************/
979 
980 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
981  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
982  * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
983  *
984  * [For the Julian calendar (which was used in Russia before 1917,
985  * Britain & colonies before 1752, anywhere else before 1582,
986  * and is still in use by some communities) leave out the
987  * -year/100+year/400 terms, and add 10.]
988  *
989  * This algorithm was first published by Gauss (I think).
990  *
991  * WARNING: this function will overflow on 2106-02-07 06:28:16 on
992  * machines where long is 32-bit! (However, as time_t is signed, we
993  * will already get problems at other places on 2038-01-19 03:14:08)
994  */
995 unsigned long
996 mktime (unsigned int year, unsigned int mon,
997         unsigned int day, unsigned int hour,
998         unsigned int min, unsigned int sec)
999 {
1000     /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
1001     if ( 0 >= (int) (mon -= 2) )
1002     {
1003         mon += 12;
1004         year -= 1;
1005     }
1006 
1007     return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
1008               year*365 - 719499
1009         )*24 + hour /* now have hours */
1010         )*60 + min  /* now have minutes */
1011         )*60 + sec; /* finally seconds */
1012 }
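/*
 * Worked example (illustrative): mktime(1970, 1, 1, 0, 0, 0) first maps
 * mon=1 to mon=11, year=1969, then 1969/4 - 1969/100 + 1969/400 +
 * 367*11/12 + 1 = 492 - 19 + 4 + 336 + 1 = 814, plus 1969*365 = 718685,
 * minus 719499, gives 0 days and hence 0 seconds -- the Unix epoch.
 */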
1013 
1014 struct rtc_time {
1015     unsigned int year, mon, day, hour, min, sec;
1016 };
1017 
1018 static void __get_cmos_time(struct rtc_time *rtc)
1019 {
1020     rtc->sec  = CMOS_READ(RTC_SECONDS);
1021     rtc->min  = CMOS_READ(RTC_MINUTES);
1022     rtc->hour = CMOS_READ(RTC_HOURS);
1023     rtc->day  = CMOS_READ(RTC_DAY_OF_MONTH);
1024     rtc->mon  = CMOS_READ(RTC_MONTH);
1025     rtc->year = CMOS_READ(RTC_YEAR);
1026 
1027     if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) )
1028     {
1029         BCD_TO_BIN(rtc->sec);
1030         BCD_TO_BIN(rtc->min);
1031         BCD_TO_BIN(rtc->hour);
1032         BCD_TO_BIN(rtc->day);
1033         BCD_TO_BIN(rtc->mon);
1034         BCD_TO_BIN(rtc->year);
1035     }
1036 
1037     if ( (rtc->year += 1900) < 1970 )
1038         rtc->year += 100;
1039 }
1040 
1041 static unsigned long get_cmos_time(void)
1042 {
1043     unsigned long res, flags;
1044     struct rtc_time rtc;
1045     unsigned int seconds = 60;
1046     static bool __read_mostly cmos_rtc_probe;
1047     boolean_param("cmos-rtc-probe", cmos_rtc_probe);
1048 
1049     if ( efi_enabled(EFI_RS) )
1050     {
1051         res = efi_get_time();
1052         if ( res )
1053             return res;
1054     }
1055 
1056     if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) )
1057         cmos_rtc_probe = false;
1058     else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe )
1059         panic("System with no CMOS RTC advertised must be booted from EFI"
1060               " (or with command line option \"cmos-rtc-probe\")\n");
1061 
1062     for ( ; ; )
1063     {
1064         s_time_t start, t1, t2;
1065 
1066         spin_lock_irqsave(&rtc_lock, flags);
1067 
1068         /* read RTC exactly on falling edge of update flag */
1069         start = NOW();
1070         do { /* may take up to 1 second... */
1071             t1 = NOW() - start;
1072         } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
1073                   t1 <= SECONDS(1) );
1074 
1075         start = NOW();
1076         do { /* must try at least 2.228 ms */
1077             t2 = NOW() - start;
1078         } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
1079                   t2 < MILLISECS(3) );
1080 
1081         __get_cmos_time(&rtc);
1082 
1083         spin_unlock_irqrestore(&rtc_lock, flags);
1084 
1085         if ( likely(!cmos_rtc_probe) ||
1086              t1 > SECONDS(1) || t2 >= MILLISECS(3) ||
1087              rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 ||
1088              !rtc.day || rtc.day > 31 ||
1089              !rtc.mon || rtc.mon > 12 )
1090             break;
1091 
1092         if ( seconds < 60 )
1093         {
1094             if ( rtc.sec != seconds )
1095                 cmos_rtc_probe = false;
1096             break;
1097         }
1098 
1099         process_pending_softirqs();
1100 
1101         seconds = rtc.sec;
1102     }
1103 
1104     if ( unlikely(cmos_rtc_probe) )
1105         panic("No CMOS RTC found - system must be booted from EFI\n");
1106 
1107     return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec);
1108 }
1109 
1110 /* Helpers for guest accesses to the physical RTC. */
1111 unsigned int rtc_guest_read(unsigned int port)
1112 {
1113     const struct domain *currd = current->domain;
1114     unsigned long flags;
1115     unsigned int data = ~0;
1116 
1117     switch ( port )
1118     {
1119     case RTC_PORT(0):
1120         /*
1121          * All PV domains (and PVH dom0) are allowed to read the latched value
1122          * of the first RTC port, as there's no access to the physical IO
1123          * ports.
1124          */
1125         data = currd->arch.cmos_idx;
1126         break;
1127 
1128     case RTC_PORT(1):
1129         if ( !ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
1130             break;
1131         spin_lock_irqsave(&rtc_lock, flags);
1132         outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
1133         data = inb(RTC_PORT(1));
1134         spin_unlock_irqrestore(&rtc_lock, flags);
1135         break;
1136 
1137     default:
1138         ASSERT_UNREACHABLE();
1139     }
1140 
1141     return data;
1142 }
1143 
1144 void rtc_guest_write(unsigned int port, unsigned int data)
1145 {
1146     struct domain *currd = current->domain;
1147     unsigned long flags;
1148 
1149     switch ( port )
1150     {
1151         typeof(pv_rtc_handler) hook;
1152 
1153     case RTC_PORT(0):
1154         /*
1155          * All PV domains (and PVH dom0) are allowed to write to the latched
1156          * value of the first RTC port, as there's no access to the physical IO
1157          * ports.
1158          */
1159         currd->arch.cmos_idx = data;
1160         break;
1161 
1162     case RTC_PORT(1):
1163         if ( !ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
1164             break;
1165 
1166         hook = ACCESS_ONCE(pv_rtc_handler);
1167         if ( hook )
1168             hook(currd->arch.cmos_idx & 0x7f, data);
1169 
1170         spin_lock_irqsave(&rtc_lock, flags);
1171         outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
1172         outb(data, RTC_PORT(1));
1173         spin_unlock_irqrestore(&rtc_lock, flags);
1174         break;
1175 
1176     default:
1177         ASSERT_UNREACHABLE();
1178     }
1179 }
1180 
1181 static unsigned long get_wallclock_time(void)
1182 {
1183 #ifdef CONFIG_XEN_GUEST
1184     if ( xen_guest )
1185     {
1186         struct shared_info *sh_info = XEN_shared_info;
1187         uint32_t wc_version;
1188         uint64_t wc_sec;
1189 
1190         do {
1191             wc_version = sh_info->wc_version & ~1;
1192             smp_rmb();
1193 
1194             wc_sec  = sh_info->wc_sec;
1195             smp_rmb();
1196         } while ( wc_version != sh_info->wc_version );
1197 
1198         return wc_sec + read_xen_timer() / 1000000000;
1199     }
1200 #endif
1201 
1202     return get_cmos_time();
1203 }
1204 
1205 /***************************************************************************
1206  * System Time
1207  ***************************************************************************/
1208 
1209 s_time_t get_s_time_fixed(u64 at_tsc)
1210 {
1211     const struct cpu_time *t = &this_cpu(cpu_time);
1212     u64 tsc, delta;
1213 
1214     if ( at_tsc )
1215         tsc = at_tsc;
1216     else
1217         tsc = rdtsc_ordered();
1218     delta = tsc - t->stamp.local_tsc;
1219     return t->stamp.local_stime + scale_delta(delta, &t->tsc_scale);
1220 }
1221 
1222 s_time_t get_s_time()
1223 {
1224     return get_s_time_fixed(0);
1225 }
1226 
1227 uint64_t tsc_ticks2ns(uint64_t ticks)
1228 {
1229     struct cpu_time *t = &this_cpu(cpu_time);
1230 
1231     return scale_delta(ticks, &t->tsc_scale);
1232 }
1233 
1234 static void __update_vcpu_system_time(struct vcpu *v, int force)
1235 {
1236     const struct cpu_time *t;
1237     struct vcpu_time_info *u, _u = {};
1238     struct domain *d = v->domain;
1239     s_time_t tsc_stamp;
1240 
1241     if ( v->vcpu_info == NULL )
1242         return;
1243 
1244     t = &this_cpu(cpu_time);
1245     u = &vcpu_info(v, time);
1246 
1247     if ( d->arch.vtsc )
1248     {
1249         s_time_t stime = t->stamp.local_stime;
1250 
1251         if ( is_hvm_domain(d) )
1252         {
1253             struct pl_time *pl = v->domain->arch.hvm.pl_time;
1254 
1255             stime += pl->stime_offset + v->arch.hvm.stime_offset;
1256             if ( stime >= 0 )
1257                 tsc_stamp = gtime_to_gtsc(d, stime);
1258             else
1259                 tsc_stamp = -gtime_to_gtsc(d, -stime);
1260         }
1261         else
1262             tsc_stamp = gtime_to_gtsc(d, stime);
1263 
1264         _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1265         _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
1266     }
1267     else
1268     {
1269         if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
1270         {
1271             tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);
1272             _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1273             _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
1274         }
1275         else
1276         {
1277             tsc_stamp            = t->stamp.local_tsc;
1278             _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
1279             _u.tsc_shift         = t->tsc_scale.shift;
1280         }
1281     }
1282 
1283     _u.tsc_timestamp = tsc_stamp;
1284     _u.system_time   = t->stamp.local_stime;
1285 
1286     /*
1287      * It's expected that domains cope with this bit changing on every
1288      * pvclock read to check whether they can rely solely on this tuple
1289      * or whether it further requires monotonicity checks with other vcpus.
1290      */
1291     if ( clocksource_is_tsc() )
1292         _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
1293 
1294     if ( is_hvm_domain(d) )
1295         _u.tsc_timestamp += v->arch.hvm.cache_tsc_offset;
1296 
1297     /* Don't bother unless timestamp record has changed or we are forced. */
1298     _u.version = u->version; /* make versions match for memcmp test */
1299     if ( !force && !memcmp(u, &_u, sizeof(_u)) )
1300         return;
1301 
1302     /* 1. Update guest kernel version. */
1303     _u.version = u->version = version_update_begin(u->version);
1304     smp_wmb();
1305     /* 2. Update all other guest kernel fields. */
1306     *u = _u;
1307     smp_wmb();
1308     /* 3. Update guest kernel version. */
1309     u->version = version_update_end(u->version);
1310 
1311     if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
1312          !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
1313         v->arch.pv.pending_system_time = _u;
1314 }
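/*
 * Illustrative guest-side counterpart (a sketch, not code from this file):
 * the three-step version update above is a seqlock, so a guest reader is
 * expected to retry while the version is odd or changes across the copy:
 *
 *     do {
 *         ver = u->version;
 *         smp_rmb();
 *         ... snapshot the vcpu_time_info fields ...
 *         smp_rmb();
 *     } while ( (ver & 1) || ver != u->version );
 *
 * read_xen_timer() earlier in this file (under CONFIG_XEN_GUEST) follows
 * the same pattern when Xen itself runs as a guest.
 */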
1315 
1316 bool update_secondary_system_time(struct vcpu *v,
1317                                   struct vcpu_time_info *u)
1318 {
1319     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
1320     struct guest_memory_policy policy = { .nested_guest_mode = false };
1321 
1322     if ( guest_handle_is_null(user_u) )
1323         return true;
1324 
1325     update_guest_memory_policy(v, &policy);
1326 
1327     /* 1. Update userspace version. */
1328     if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
1329     {
1330         update_guest_memory_policy(v, &policy);
1331         return false;
1332     }
1333     smp_wmb();
1334     /* 2. Update all other userspace fields. */
1335     __copy_to_guest(user_u, u, 1);
1336     smp_wmb();
1337     /* 3. Update userspace version. */
1338     u->version = version_update_end(u->version);
1339     __copy_field_to_guest(user_u, u, version);
1340 
1341     update_guest_memory_policy(v, &policy);
1342 
1343     return true;
1344 }
1345 
1346 void update_vcpu_system_time(struct vcpu *v)
1347 {
1348     __update_vcpu_system_time(v, 0);
1349 }
1350 
1351 void force_update_vcpu_system_time(struct vcpu *v)
1352 {
1353     __update_vcpu_system_time(v, 1);
1354 }
1355 
1356 static void update_domain_rtc(void)
1357 {
1358     struct domain *d;
1359 
1360     rcu_read_lock(&domlist_read_lock);
1361 
1362     for_each_domain ( d )
1363         if ( is_hvm_domain(d) )
1364             rtc_update_clock(d);
1365 
1366     rcu_read_unlock(&domlist_read_lock);
1367 }
1368 
1369 void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
1370 {
1371     d->time_offset.seconds = time_offset_seconds;
1372     d->time_offset.set = true;
1373     if ( is_hvm_domain(d) )
1374         rtc_update_clock(d);
1375     update_domain_wallclock_time(d);
1376 }
1377 
1378 int cpu_frequency_change(u64 freq)
1379 {
1380     struct cpu_time *t = &this_cpu(cpu_time);
1381     u64 curr_tsc;
1382 
1383     /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
1384     if ( freq < 1000000u )
1385     {
1386         printk(XENLOG_WARNING "Rejecting CPU frequency change "
1387                "to %"PRIu64" Hz\n", freq);
1388         return -EINVAL;
1389     }
1390 
1391     local_irq_disable();
1392     /* Platform time /first/, as we may be delayed by platform_timer_lock. */
1393     t->stamp.master_stime = read_platform_stime(NULL);
1394     curr_tsc = rdtsc_ordered();
1395     /* TSC-extrapolated time may be bogus after frequency change. */
1396     /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/
1397     t->stamp.local_stime = t->stamp.master_stime;
1398     t->stamp.local_tsc = curr_tsc;
1399     set_time_scale(&t->tsc_scale, freq);
1400     local_irq_enable();
1401 
1402     update_vcpu_system_time(current);
1403 
1404     /* A full epoch should pass before we check for deviation. */
1405     if ( smp_processor_id() == 0 )
1406     {
1407         set_timer(&calibration_timer, NOW() + EPOCH);
1408         platform_time_calibration();
1409     }
1410 
1411     return 0;
1412 }
1413 
1414 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
1415 static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration);
1416 
1417 /* Softirq handler for per-CPU time calibration. */
1418 static void local_time_calibration(void)
1419 {
1420     struct cpu_time *t = &this_cpu(cpu_time);
1421     const struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1422 
1423     /*
1424      * System (extrapolated from local and master oscillators) and TSC
1425      * timestamps, taken during this calibration and the previous one.
1426      */
1427     struct cpu_time_stamp prev, curr;
1428 
1429     /*
1430      * System time and TSC ticks elapsed during the previous calibration
1431      * 'epoch'. These values are down-shifted to fit in 32 bits.
1432      */
1433     u64 stime_elapsed64, tsc_elapsed64;
1434     u32 stime_elapsed32, tsc_elapsed32;
1435 
1436     /* Error correction to slow down a fast local clock. */
1437     u32 error_factor = 0;
1438 
1439     /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1440     int tsc_shift = 0;
1441 
1442     /* The overall calibration scale multiplier. */
1443     u32 calibration_mul_frac;
1444 
1445     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1446     {
1447         /* Atomically read cpu_calibration struct and write cpu_time struct. */
1448         local_irq_disable();
1449         t->stamp = *c;
1450         local_irq_enable();
1451         update_vcpu_system_time(current);
1452         goto out;
1453     }
1454 
1455     prev = t->stamp;
1456 
1457     /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1458     local_irq_disable();
1459     curr = *c;
1460     local_irq_enable();
1461 
1462 #if 0
1463     printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1464            smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime);
1465     printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1466            " -> %"PRId64"\n",
1467            smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime,
1468            curr.master_stime - curr.local_stime);
1469 #endif
1470 
1471     /* Local time warps forward if it lags behind master time. */
1472     if ( curr.local_stime < curr.master_stime )
1473         curr.local_stime = curr.master_stime;
1474 
1475     stime_elapsed64 = curr.master_stime - prev.master_stime;
1476     tsc_elapsed64   = curr.local_tsc - prev.local_tsc;
1477 
1478     /*
1479      * Weirdness can happen if we lose sync with the platform timer.
1480      * We could be smarter here: resync platform timer with local timer?
1481      */
1482     if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1483         goto out;
1484 
1485     /*
1486      * Calculate error-correction factor. This only slows down a fast local
1487      * clock (slow clocks are warped forwards). The scale factor is clamped
1488      * to >= 0.5.
1489      */
1490     if ( curr.local_stime != curr.master_stime )
1491     {
1492         u64 local_stime_err = curr.local_stime - curr.master_stime;
1493 
1494         if ( local_stime_err > EPOCH )
1495             local_stime_err = EPOCH;
1496         error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1497     }
1498 
1499     /*
1500      * We require 0 < stime_elapsed < 2^31.
1501      * This allows us to binary shift a 32-bit tsc_elapsed such that:
1502      * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1503      */
1504     while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1505             ((s32)stime_elapsed64 < 0) )
1506     {
1507         stime_elapsed64 >>= 1;
1508         tsc_elapsed64   >>= 1;
1509     }
1510 
1511     /* stime_elapsed now fits in a 32-bit word. */
1512     stime_elapsed32 = (u32)stime_elapsed64;
1513 
1514     /* tsc_elapsed <= 2*stime_elapsed */
1515     while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1516     {
1517         tsc_elapsed64 >>= 1;
1518         tsc_shift--;
1519     }
1520 
1521     /* Local difference must now fit in 32 bits. */
1522     ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1523     tsc_elapsed32 = (u32)tsc_elapsed64;
1524 
1525     /* tsc_elapsed > stime_elapsed */
1526     ASSERT(tsc_elapsed32 != 0);
1527     while ( tsc_elapsed32 <= stime_elapsed32 )
1528     {
1529         tsc_elapsed32 <<= 1;
1530         tsc_shift++;
1531     }
1532 
1533     calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1534     if ( error_factor != 0 )
1535         calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1536 
1537 #if 0
1538     printk("---%d: %08x %08x %d\n", smp_processor_id(),
1539            error_factor, calibration_mul_frac, tsc_shift);
1540 #endif
1541 
1542     /* Record new timestamp information, atomically w.r.t. interrupts. */
1543     local_irq_disable();
1544     t->tsc_scale.mul_frac = calibration_mul_frac;
1545     t->tsc_scale.shift    = tsc_shift;
1546     t->stamp              = curr;
1547     local_irq_enable();
1548 
1549     update_vcpu_system_time(current);
1550 
1551  out:
1552     if ( smp_processor_id() == 0 )
1553     {
1554         set_timer(&calibration_timer, NOW() + EPOCH);
1555         platform_time_calibration();
1556     }
1557 }
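/*
 * Worked example (illustrative): with EPOCH = 1 s, a local clock that ran
 * 1 ms ahead of the master over the last epoch gets error_factor =
 * div_frac(1e9, 1e9 + 1e6) ~= 0.999 in 0.32 fixed point, so the new
 * tsc_scale is roughly 0.1% smaller and the local clock is slowed until
 * the master catches up (slow clocks are instead warped forward).
 */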
1558 
1559 /*
1560  * TSC Reliability check
1561  */
1562 
1563 /*
1564  * The Linux original version of this function is
1565  * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1566  */
1567 static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1568 {
1569     static DEFINE_SPINLOCK(sync_lock);
1570     static cycles_t last_tsc;
1571 
1572     cycles_t start, now, prev, end;
1573     int i;
1574 
1575     start = rdtsc_ordered();
1576 
1577     /* The measurement runs for 20 msecs: */
1578     end = start + tsc_khz * 20ULL;
1579     now = start;
1580 
1581     for ( i = 0; ; i++ )
1582     {
1583         /*
1584          * We take the global lock, measure TSC, save the
1585          * previous TSC that was measured (possibly on
1586          * another CPU) and update the previous TSC timestamp.
1587          */
1588         spin_lock(&sync_lock);
1589         prev = last_tsc;
1590         now = rdtsc_ordered();
1591         last_tsc = now;
1592         spin_unlock(&sync_lock);
1593 
1594         /*
1595          * Be nice every now and then (and also check whether measurement is
1596          * done [we also insert a 10 million loops safety exit, so we don't
1597          * lock up in case the TSC readout is totally broken]):
1598          */
1599         if ( unlikely(!(i & 7)) )
1600         {
1601             if ( (now > end) || (i > 10000000) )
1602                 break;
1603             cpu_relax();
1604             /*touch_nmi_watchdog();*/
1605         }
1606 
1607         /*
1608          * Outside the critical section we can now see whether we saw a
1609          * time-warp of the TSC going backwards:
1610          */
1611         if ( unlikely(prev > now) )
1612         {
1613             spin_lock(&sync_lock);
1614             if ( *max_warp < prev - now )
1615                 *max_warp = prev - now;
1616             spin_unlock(&sync_lock);
1617         }
1618     }
1619 }
1620 
1621 static unsigned long tsc_max_warp, tsc_check_count;
1622 static cpumask_t tsc_check_cpumask;
1623 
1624 static void tsc_check_slave(void *unused)
1625 {
1626     unsigned int cpu = smp_processor_id();
1627     local_irq_disable();
1628     while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) )
1629         cpu_relax();
1630     check_tsc_warp(cpu_khz, &tsc_max_warp);
1631     cpumask_clear_cpu(cpu, &tsc_check_cpumask);
1632     local_irq_enable();
1633 }
1634 
1635 static void tsc_check_reliability(void)
1636 {
1637     unsigned int cpu = smp_processor_id();
1638     static DEFINE_SPINLOCK(lock);
1639 
1640     spin_lock(&lock);
1641 
1642     tsc_check_count++;
1643     smp_call_function(tsc_check_slave, NULL, 0);
1644     cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu));
1645     local_irq_disable();
1646     check_tsc_warp(cpu_khz, &tsc_max_warp);
1647     local_irq_enable();
1648     while ( !cpumask_empty(&tsc_check_cpumask) )
1649         cpu_relax();
1650 
1651     spin_unlock(&lock);
1652 }
1653 
1654 /*
1655  * Rendezvous for all CPUs in IRQ context.
1656  * Master CPU snapshots the platform timer.
1657  * All CPUs snapshot their local TSC and their extrapolation of system time.
1658  */
1659 struct calibration_rendezvous {
1660     cpumask_t cpu_calibration_map;
1661     atomic_t semaphore;
1662     s_time_t master_stime;
1663     u64 master_tsc_stamp;
1664 };
1665 
1666 static void
1667 time_calibration_rendezvous_tail(const struct calibration_rendezvous *r)
1668 {
1669     struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1670 
1671     c->local_tsc    = rdtsc_ordered();
1672     c->local_stime  = get_s_time_fixed(c->local_tsc);
1673     c->master_stime = r->master_stime;
1674 
1675     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1676 }
1677 
1678 /*
1679  * Keep TSCs in sync when they run at the same rate, but may stop in
1680  * deep-sleep C states.
1681  */
1682 static void time_calibration_tsc_rendezvous(void *_r)
1683 {
1684     int i;
1685     struct calibration_rendezvous *r = _r;
1686     unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1687 
1688     /* Loop to get rid of cache effects on TSC skew. */
1689     for ( i = 4; i >= 0; i-- )
1690     {
1691         if ( smp_processor_id() == 0 )
1692         {
1693             while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1694                 cpu_relax();
1695 
1696             if ( r->master_stime == 0 )
1697             {
1698                 r->master_stime = read_platform_stime(NULL);
1699                 r->master_tsc_stamp = rdtsc_ordered();
1700             }
1701             atomic_inc(&r->semaphore);
1702 
1703             if ( i == 0 )
1704                 write_tsc(r->master_tsc_stamp);
1705 
1706             while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1707                 cpu_relax();
1708             atomic_set(&r->semaphore, 0);
1709         }
1710         else
1711         {
1712             atomic_inc(&r->semaphore);
1713             while ( atomic_read(&r->semaphore) < total_cpus )
1714                 cpu_relax();
1715 
1716             if ( i == 0 )
1717                 write_tsc(r->master_tsc_stamp);
1718 
1719             atomic_inc(&r->semaphore);
1720             while ( atomic_read(&r->semaphore) > total_cpus )
1721                 cpu_relax();
1722         }
1723     }
1724 
1725     time_calibration_rendezvous_tail(r);
1726 }
1727 
1728 /* Ordinary rendezvous function which does not modify TSC values. */
1729 static void time_calibration_std_rendezvous(void *_r)
1730 {
1731     struct calibration_rendezvous *r = _r;
1732     unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1733 
1734     if ( smp_processor_id() == 0 )
1735     {
1736         while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1737             cpu_relax();
1738         r->master_stime = read_platform_stime(NULL);
1739         smp_wmb(); /* write r->master_stime /then/ signal */
1740         atomic_inc(&r->semaphore);
1741     }
1742     else
1743     {
1744         atomic_inc(&r->semaphore);
1745         while ( atomic_read(&r->semaphore) != total_cpus )
1746             cpu_relax();
1747         smp_rmb(); /* receive signal /then/ read r->master_stime */
1748     }
1749 
1750     time_calibration_rendezvous_tail(r);
1751 }
1752 
1753 /*
1754  * Rendezvous function used when clocksource is TSC and
1755  * no CPU hotplug will be performed.
1756  */
1757 static void time_calibration_nop_rendezvous(void *rv)
1758 {
1759     const struct calibration_rendezvous *r = rv;
1760     struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1761 
1762     c->local_tsc    = r->master_tsc_stamp;
1763     c->local_stime  = r->master_stime;
1764     c->master_stime = r->master_stime;
1765 
1766     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1767 }
1768 
1769 static void (*time_calibration_rendezvous_fn)(void *) =
1770     time_calibration_std_rendezvous;
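/*
 * verify_tsc_reliability() below may replace this default with the
 * TSC-syncing or the nop variant, depending on boot-time TSC checks.
 */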
1771 
1772 static void time_calibration(void *unused)
1773 {
1774     struct calibration_rendezvous r = {
1775         .semaphore = ATOMIC_INIT(0)
1776     };
1777 
1778     if ( clocksource_is_tsc() )
1779     {
1780         local_irq_disable();
1781         r.master_stime = read_platform_stime(&r.master_tsc_stamp);
1782         local_irq_enable();
1783     }
1784 
1785     cpumask_copy(&r.cpu_calibration_map, &cpu_online_map);
1786 
1787     /* @wait=1 because we must wait for all cpus before freeing @r. */
1788     on_selected_cpus(&r.cpu_calibration_map,
1789                      time_calibration_rendezvous_fn,
1790                      &r, 1);
1791 }
1792 
1793 static struct cpu_time_stamp ap_bringup_ref;
1794 
1795 void time_latch_stamps(void)
1796 {
1797     unsigned long flags;
1798 
1799     local_irq_save(flags);
1800     ap_bringup_ref.master_stime = read_platform_stime(NULL);
1801     ap_bringup_ref.local_tsc = rdtsc_ordered();
1802     local_irq_restore(flags);
1803 
1804     ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc);
1805 }
1806 
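/*
 * Seed this CPU's cpu_time: borrow the BSP's TSC scale as an initial
 * estimate, bring the socket's TSC_ADJUST value into agreement when that
 * MSR is tracked, and stamp local TSC/system time against the platform
 * timer.
 */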
1807 void init_percpu_time(void)
1808 {
1809     struct cpu_time *t = &this_cpu(cpu_time);
1810     unsigned long flags;
1811     u64 tsc;
1812     s_time_t now;
1813 
1814     /* Initial estimate for TSC rate. */
1815     t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1816 
1817     if ( tsc_adjust )
1818     {
1819         unsigned int socket = cpu_to_socket(smp_processor_id());
1820         int64_t adj;
1821 
1822         /* For now we don't want to come here for the BSP. */
1823         ASSERT(system_state >= SYS_STATE_smp_boot);
1824 
1825         rdmsrl(MSR_IA32_TSC_ADJUST, adj);
1826 
1827         /*
1828          * Check whether this CPU is the first in a package to come up. In
1829          * this case do not check the boot value against another package
1830          * because the new package might have been physically hotplugged,
1831          * where TSC_ADJUST is expected to be different.
1832          */
1833         if ( cpumask_weight(socket_cpumask[socket]) == 1 )
1834         {
1835             /*
1836              * On the boot CPU we just force the ADJUST value to 0 if it's non-
1837              * zero (in early_time_init()). We don't do that on non-boot CPUs
1838              * because physical hotplug should have set the ADJUST register to a
1839              * value > 0, so the TSC is in sync with the already running CPUs.
1840              *
1841              * But we always force non-negative ADJUST values for now.
1842              */
1843             if ( adj < 0 )
1844             {
1845                 printk(XENLOG_WARNING
1846                        "TSC ADJUST set to -%lx on CPU%u - clearing\n",
1847                        -adj, smp_processor_id());
1848                 wrmsrl(MSR_IA32_TSC_ADJUST, 0);
1849                 adj = 0;
1850             }
1851             tsc_adjust[socket] = adj;
1852         }
1853         else if ( adj != tsc_adjust[socket] )
1854         {
1855             static bool __read_mostly warned;
1856 
1857             if ( !warned )
1858             {
1859                 warned = true;
1860                 printk(XENLOG_WARNING
1861                        "Differing TSC ADJUST values within socket(s) - fixing all\n");
1862             }
1863             wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust[socket]);
1864         }
1865     }
1866 
1867     local_irq_save(flags);
1868     now = read_platform_stime(NULL);
1869     tsc = rdtsc_ordered();
1870     local_irq_restore(flags);
1871 
1872     t->stamp.master_stime = now;
1873     /*
1874      * To avoid a discontinuity (TSC and platform clock can't be expected
1875      * to be in perfect sync), initialization here needs to match up with
1876      * local_time_calibration()'s decision whether to use its fast path.
1877      */
1878     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1879     {
1880         if ( system_state < SYS_STATE_smp_boot )
1881             now = get_s_time_fixed(tsc);
1882         else
1883             now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
1884     }
1885     t->stamp.local_tsc   = tsc;
1886     t->stamp.local_stime = now;
1887 }
1888 
1889 /*
1890  * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits.
1891  * Obviously we must not use write_tsc() on such CPUs.
1892  *
1893  * Additionally, AMD specifies that being able to write the TSC MSR is not an
1894  * architectural feature (but, contrary to what their manual says, it also
1895  * cannot be determined from CPUID bits).
1896  */
1897 static void __init tsc_check_writability(void)
1898 {
1899     const char *what = NULL;
1900     uint64_t tsc;
1901 
1902     /*
1903      * If all CPUs are reported as synchronised and in sync, we never write
1904      * the TSCs (except unavoidably, when a CPU is physically hot-plugged).
1905      * Hence testing for writability is pointless and even harmful.
1906      */
1907     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1908         return;
1909 
1910     tsc = rdtsc();
1911     if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
1912     {
1913         uint64_t tmp, tmp2 = rdtsc();
1914 
1915         write_tsc(tsc | (1ULL << 32));
1916         tmp = rdtsc();
1917         if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
1918             what = "only partially";
1919     }
1920     else
1921     {
1922         what = "not";
1923     }
1924 
1925     /* Nothing to do if the TSC is fully writable. */
1926     if ( !what )
1927     {
1928         /*
1929          * Paranoia - write back original TSC value. However, APs get synced
1930          * with BSP as they are brought up, so this doesn't much matter.
1931          */
1932         write_tsc(tsc);
1933         return;
1934     }
1935 
1936     printk(XENLOG_WARNING "TSC %s writable\n", what);
1937 
1938     /* time_calibration_tsc_rendezvous() must not be used */
1939     setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1940 
1941     /* cstate_restore_tsc() must not be used (or do nothing) */
1942     if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
1943         cpuidle_disable_deep_cstate();
1944 
1945     /* synchronize_tsc_slave() must do nothing */
1946     disable_tsc_sync = true;
1947 }
1948 
1949 static void __init reset_percpu_time(void *unused)
1950 {
1951     struct cpu_time *t = &this_cpu(cpu_time);
1952 
1953     t->stamp.local_tsc = boot_tsc_stamp;
1954     t->stamp.local_stime = 0;
1955     t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp);
1956     t->stamp.master_stime = t->stamp.local_stime;
1957 }
1958 
1959 static void __init try_platform_timer_tail(bool late)
1960 {
1961     init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
1962     plt_overflow(NULL);
1963 
1964     platform_timer_stamp = plt_stamp64;
1965     stime_platform_stamp = NOW();
1966 
1967     if ( !late )
1968         init_percpu_time();
1969 
1970     init_timer(&calibration_timer, time_calibration, NULL, 0);
1971     set_timer(&calibration_timer, NOW() + EPOCH);
1972 }
1973 
1974 /* Late init function, after all cpus have booted */
1975 static int __init verify_tsc_reliability(void)
1976 {
1977     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1978     {
1979         /*
1980          * Sadly, despite processor vendors' best design guidance efforts, on
1981          * some systems, CPUs may come out of reset improperly synchronized.
1982          * So we must verify there is no warp and we can't do that until all
1983          * CPUs are booted.
1984          */
1985         tsc_check_reliability();
1986         if ( tsc_max_warp )
1987         {
1988             printk("TSC warp detected, disabling TSC_RELIABLE\n");
1989             setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1990         }
1991         else if ( !strcmp(opt_clocksource, "tsc") &&
1992                   (try_platform_timer(&plt_tsc) > 0) )
1993         {
1994             /*
1995              * The platform timer has changed and CPU time will only be updated
1996              * after the calibration timer is set again, which means we need to
1997              * re-seed each local CPU time. At this stage the TSC is known to be
1998              * reliable, i.e. monotonically increasing across all CPUs, so this
1999              * lets us remove the skew between platform timer and TSC, since
2000              * these are now effectively the same.
2001              */
2002             on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1);
2003 
2004             /*
2005              * We won't do CPU hotplug and the TSC clocksource is in use, which
2006              * means we have a reliable TSC; and since we don't sync with any
2007              * other clocksource, there is no need for a rendezvous.
2008              */
2009             time_calibration_rendezvous_fn = time_calibration_nop_rendezvous;
2010 
2011             /* Finish platform timer switch. */
2012             try_platform_timer_tail(true);
2013 
2014             printk("Switched to Platform timer %s TSC\n",
2015                    freq_string(plt_src.frequency));
2016             return 0;
2017         }
2018     }
2019 
2020     /*
2021      * Re-run the TSC writability check if it didn't run to completion, as
2022      * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed
2023      * for determining which rendezvous function to use (below).
2024      */
2025     if ( !disable_tsc_sync )
2026         tsc_check_writability();
2027 
2028     /*
2029      * While with constant-rate TSCs the scale factor can be shared, when TSCs
2030      * are not marked as 'reliable', re-sync during rendezvous.
2031      */
2032     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
2033          !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
2034         time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
2035 
2036     return 0;
2037 }
2038 __initcall(verify_tsc_reliability);
2039 
2040 /* Late init function (after interrupts are enabled). */
2041 int __init init_xen_time(void)
2042 {
2043     tsc_check_writability();
2044 
2045     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
2046 
2047     /* NB. get_wallclock_time() can take over one second to execute. */
2048     do_settime(get_wallclock_time(), 0, NOW());
2049 
2050     /* Finish platform timer initialization. */
2051     try_platform_timer_tail(false);
2052 
2053     /*
2054      * Setup space to track per-socket TSC_ADJUST values. Don't fiddle with
2055      * values if the TSC is not reported as invariant. Ignore allocation
2056      * failure here - most systems won't need any adjustment anyway.
2057      */
2058     if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
2059          boot_cpu_has(X86_FEATURE_ITSC) )
2060         tsc_adjust = xzalloc_array(uint64_t, nr_sockets);
2061 
2062     return 0;
2063 }
2064 
2065 
2066 /* Early init function. */
2067 void __init early_time_init(void)
2068 {
2069     struct cpu_time *t = &this_cpu(cpu_time);
2070     u64 tmp;
2071 
2072     if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
2073          boot_cpu_has(X86_FEATURE_ITSC) )
2074     {
2075         rdmsrl(MSR_IA32_TSC_ADJUST, tmp);
2076         if ( tmp )
2077         {
2078             printk(XENLOG_WARNING
2079                    "TSC ADJUST set to %lx on boot CPU - clearing\n", tmp);
2080             wrmsrl(MSR_IA32_TSC_ADJUST, 0);
2081             boot_tsc_stamp -= tmp;
2082         }
2083     }
2084 
2085     preinit_pit();
2086     tmp = init_platform_timer();
2087     plt_tsc.frequency = tmp;
2088 
2089     set_time_scale(&t->tsc_scale, tmp);
2090     t->stamp.local_tsc = boot_tsc_stamp;
2091 
2092     do_div(tmp, 1000);
2093     cpu_khz = (unsigned long)tmp;
2094     printk("Detected %lu.%03lu MHz processor.\n",
2095            cpu_khz / 1000, cpu_khz % 1000);
2096 
2097     setup_irq(0, 0, &irq0);
2098 }
2099 
2100 /* Keep the PIT enabled so PIT broadcast keeps working while cpuidle is enabled. */
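/*
 * Return value: -1 if PIT CH0 is left enabled (PIT is the platform timer,
 * there is no local APIC, or HPET broadcast setup failed with cpuidle
 * explicitly enabled); 0 if CH0 was disabled but HPET broadcast is
 * unavailable (callers then disable cpuidle); 1 otherwise.
 */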
2101 static int _disable_pit_irq(void(*hpet_broadcast_setup)(void))
2102 {
2103     int ret = 1;
2104 
2105     if ( using_pit || !cpu_has_apic )
2106         return -1;
2107 
2108     /*
2109      * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
2110      * emulation when entering deep C states.
2111      * XXX dom0 may rely on RTC interrupt delivery, so only enable
2112      * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
2113      */
2114     if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) )
2115     {
2116         hpet_broadcast_setup();
2117         if ( !hpet_broadcast_is_available() )
2118         {
2119             if ( xen_cpuidle > 0 )
2120             {
2121                 printk("%ps() failed, turning to PIT broadcast\n",
2122                        hpet_broadcast_setup);
2123                 return -1;
2124             }
2125             ret = 0;
2126         }
2127     }
2128 
2129     /* Disable PIT CH0 timer interrupt. */
2130     outb_p(0x30, PIT_MODE);
2131     outb_p(0, PIT_CH0);
2132     outb_p(0, PIT_CH0);
2133 
2134     return ret;
2135 }
2136 
2137 static int __init disable_pit_irq(void)
2138 {
2139     if ( !_disable_pit_irq(hpet_broadcast_init) )
2140     {
2141         xen_cpuidle = 0;
2142         printk("CPUIDLE: disabled due to no HPET. "
2143                "Force enable with 'cpuidle'.\n");
2144     }
2145 
2146     return 0;
2147 }
2148 __initcall(disable_pit_irq);
2149 
2150 void pit_broadcast_enter(void)
2151 {
2152     cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask);
2153 }
2154 
2155 void pit_broadcast_exit(void)
2156 {
2157     int cpu = smp_processor_id();
2158 
2159     if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) )
2160         reprogram_timer(this_cpu(timer_deadline));
2161 }
2162 
2163 int pit_broadcast_is_available(void)
2164 {
2165     return cpuidle_using_deep_cstate();
2166 }
2167 
2168 void send_timer_event(struct vcpu *v)
2169 {
2170     send_guest_vcpu_virq(v, VIRQ_TIMER);
2171 }
2172 
2173 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
2174 static long cmos_utc_offset; /* in seconds */
2175 
2176 int time_suspend(void)
2177 {
2178     if ( smp_processor_id() == 0 )
2179     {
2180         cmos_utc_offset = -get_wallclock_time();
2181         cmos_utc_offset += get_sec();
2182         kill_timer(&calibration_timer);
2183 
2184         /* Sync platform timer stamps. */
2185         platform_time_calibration();
2186     }
2187 
2188     /* Better to cancel calibration timer for accuracy. */
2189     clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
2190 
2191     return 0;
2192 }
2193 
2194 int time_resume(void)
2195 {
2196     preinit_pit();
2197 
2198     resume_platform_timer();
2199 
2200     if ( !_disable_pit_irq(hpet_broadcast_resume) )
2201         BUG();
2202 
2203     init_percpu_time();
2204 
2205     set_timer(&calibration_timer, NOW() + EPOCH);
2206 
2207     do_settime(get_wallclock_time() + cmos_utc_offset, 0, NOW());
2208 
2209     update_vcpu_system_time(current);
2210 
2211     update_domain_rtc();
2212 
2213     return 0;
2214 }
2215 
2216 int hwdom_pit_access(struct ioreq *ioreq)
2217 {
2218     /* Is Xen using Channel 2? Then disallow direct dom0 access. */
2219     if ( using_pit )
2220         return 0;
2221 
2222     switch ( ioreq->addr )
2223     {
2224     case PIT_CH2:
2225         if ( ioreq->dir == IOREQ_READ )
2226             ioreq->data = inb(PIT_CH2);
2227         else
2228             outb(ioreq->data, PIT_CH2);
2229         return 1;
2230 
2231     case PIT_MODE:
2232         if ( ioreq->dir == IOREQ_READ )
2233             return 0; /* urk! */
2234         switch ( ioreq->data & 0xc0 )
2235         {
2236         case 0xc0: /* Read Back */
2237             if ( ioreq->data & 0x08 )    /* Select Channel 2? */
2238                 outb(ioreq->data & 0xf8, PIT_MODE);
2239             if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
2240                 return 1; /* no - we're done */
2241             /* Filter Channel 2 and reserved bit 0. */
2242             ioreq->data &= ~0x09;
2243             return 0; /* emulate ch0/1 readback */
2244         case 0x80: /* Select Counter 2 */
2245             outb(ioreq->data, PIT_MODE);
2246             return 1;
2247         }
2248         break;
2249 
2250     case 0x61:
2251         if ( ioreq->dir == IOREQ_READ )
2252             ioreq->data = inb(0x61);
2253         else
2254             outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
2255         return 1;
2256     }
2257 
2258     return 0;
2259 }
2260 
2261 /*
2262  * PV SoftTSC Emulation.
2263  */
2264 
2265 /*
2266  * tsc=unstable: Override all tests; assume TSC is unreliable.
2267  * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs.
2268  * tsc=stable:socket: Assume TSCs are reliable across sockets.
2269  */
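/* Example (Xen command line): tsc=unstable, tsc=skewed or tsc=stable:socket. */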
2270 static int __init tsc_parse(const char *s)
2271 {
2272     if ( !strcmp(s, "unstable") )
2273     {
2274         setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
2275         setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC);
2276         setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
2277     }
2278     else if ( !strcmp(s, "skewed") )
2279         setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
2280     else if ( !strcmp(s, "stable:socket") )
2281         tsc_flags |= TSC_RELIABLE_SOCKET;
2282     else
2283         return -EINVAL;
2284 
2285     return 0;
2286 }
2287 custom_param("tsc", tsc_parse);
2288 
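/*
 * Convert between guest time (ns) and guest TSC ticks using the domain's
 * scale factors.  For non-HVM domains the domain's start offset is applied,
 * so roughly: gtsc = (gtime - vtsc_offset) * ns_to_vtsc and
 * gtime = gtsc * vtsc_to_ns + vtsc_offset.
 */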
2289 u64 gtime_to_gtsc(struct domain *d, u64 time)
2290 {
2291     if ( !is_hvm_domain(d) )
2292     {
2293         if ( time < d->arch.vtsc_offset )
2294             return -scale_delta(d->arch.vtsc_offset - time,
2295                                 &d->arch.ns_to_vtsc);
2296         time -= d->arch.vtsc_offset;
2297     }
2298     return scale_delta(time, &d->arch.ns_to_vtsc);
2299 }
2300 
2301 u64 gtsc_to_gtime(struct domain *d, u64 tsc)
2302 {
2303     u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);
2304 
2305     if ( !is_hvm_domain(d) )
2306         time += d->arch.vtsc_offset;
2307     return time;
2308 }
2309 
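/*
 * Emulated RDTSC for PV guests: derive the virtual TSC from system time,
 * using a cmpxchg loop on vtsc_last so the values handed out are strictly
 * monotonic even when multiple vCPUs race here.
 */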
2310 uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs)
2311 {
2312     s_time_t old, new, now = get_s_time();
2313     struct domain *d = v->domain;
2314 
2315     do {
2316         old = d->arch.vtsc_last;
2317         new = now > d->arch.vtsc_last ? now : old + 1;
2318     } while ( cmpxchg(&d->arch.vtsc_last, old, new) != old );
2319 
2320     return gtime_to_gtsc(d, new);
2321 }
2322 
2323 bool clocksource_is_tsc(void)
2324 {
2325     return plt_src.read_counter == read_tsc;
2326 }
2327 
2328 int host_tsc_is_safe(void)
2329 {
2330     return boot_cpu_has(X86_FEATURE_TSC_RELIABLE);
2331 }
2332 
2333 /*
2334  * called to collect tsc-related data only for save file or live
2335  * migrate; called after last rdtsc is done on this incarnation
2336  */
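/*
 * Note the case label nested inside the if() in the switch below:
 * TSC_MODE_ALWAYS_EMULATE shares the emulated-TSC path with
 * TSC_MODE_DEFAULT when vtsc is in use.
 */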
2337 void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
2338                   uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
2339                   uint32_t *incarnation)
2340 {
2341     bool enable_tsc_scaling = is_hvm_domain(d) &&
2342                               hvm_tsc_scaling_supported && !d->arch.vtsc;
2343 
2344     *incarnation = d->arch.incarnation;
2345     *tsc_mode = d->arch.tsc_mode;
2346 
2347     switch ( *tsc_mode )
2348     {
2349         uint64_t tsc;
2350 
2351     case TSC_MODE_NEVER_EMULATE:
2352         *elapsed_nsec = *gtsc_khz = 0;
2353         break;
2354     case TSC_MODE_DEFAULT:
2355         if ( d->arch.vtsc )
2356         {
2357     case TSC_MODE_ALWAYS_EMULATE:
2358             *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
2359             *gtsc_khz = d->arch.tsc_khz;
2360             break;
2361         }
2362         tsc = rdtsc();
2363         *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
2364         *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz;
2365         break;
2366     }
2367 
2368     if ( (int64_t)*elapsed_nsec < 0 )
2369         *elapsed_nsec = 0;
2370 }
2371 
2372 /*
2373  * This may be called as many as three times for a domain, once when the
2374  * hypervisor creates the domain, once when the toolstack creates the
2375  * domain and, if restoring/migrating, once when saved/migrated values
2376  * are restored.  Care must be taken that, if multiple calls occur,
2377  * only the last one "sticks", and that all are completed before the guest
2378  * executes an rdtsc instruction.
2379  */
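/*
 * As in tsc_get_info(), a case label sits inside the if() below:
 * TSC_MODE_DEFAULT takes the same vtsc = 0 path as TSC_MODE_NEVER_EMULATE
 * when the host TSC is safe to expose and the frequencies match (natively
 * or via HVM TSC scaling).
 */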
2380 int tsc_set_info(struct domain *d,
2381                  uint32_t tsc_mode, uint64_t elapsed_nsec,
2382                  uint32_t gtsc_khz, uint32_t incarnation)
2383 {
2384     ASSERT(!is_system_domain(d));
2385 
2386     if ( is_pv_domain(d) && is_hardware_domain(d) )
2387     {
2388         d->arch.vtsc = 0;
2389         return 0;
2390     }
2391 
2392     switch ( tsc_mode )
2393     {
2394     case TSC_MODE_DEFAULT:
2395     case TSC_MODE_ALWAYS_EMULATE:
2396         d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
2397         d->arch.tsc_khz = gtsc_khz ?: cpu_khz;
2398         set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
2399 
2400         /*
2401          * In default mode use native TSC if the host has safe TSC and
2402          * host and guest frequencies are the same (either "naturally" or
2403          * - for HVM/PVH - via TSC scaling).
2404          * When a guest is created, gtsc_khz is passed in as zero, making
2405          * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
2406          */
2407         if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() &&
2408              (d->arch.tsc_khz == cpu_khz ||
2409               (is_hvm_domain(d) &&
2410                hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
2411         {
2412     case TSC_MODE_NEVER_EMULATE:
2413             d->arch.vtsc = 0;
2414             break;
2415         }
2416         d->arch.vtsc = 1;
2417         d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
2418         break;
2419 
2420     default:
2421         return -EINVAL;
2422     }
2423 
2424     d->arch.tsc_mode = tsc_mode;
2425 
2426     d->arch.incarnation = incarnation + 1;
2427     if ( is_hvm_domain(d) )
2428     {
2429         if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
2430             d->arch.hvm.tsc_scaling_ratio =
2431                 hvm_get_tsc_scaling_ratio(d->arch.tsc_khz);
2432 
2433         hvm_set_rdtsc_exiting(d, d->arch.vtsc);
2434         if ( d->vcpu && d->vcpu[0] && incarnation == 0 )
2435         {
2436             /*
2437              * set_tsc_offset() is called from hvm_vcpu_initialise() before
2438              * tsc_set_info(). New vtsc mode may require recomputing TSC
2439              * offset.
2440              * We only need to do this for BSP during initial boot. APs will
2441              * call set_tsc_offset() later from hvm_vcpu_reset_state() and they
2442              * will sync their TSC to BSP's sync_tsc.
2443              */
2444             d->arch.hvm.sync_tsc = rdtsc();
2445             hvm_set_tsc_offset(d->vcpu[0],
2446                                d->vcpu[0]->arch.hvm.cache_tsc_offset,
2447                                d->arch.hvm.sync_tsc);
2448         }
2449     }
2450 
2451     recalculate_cpuid_policy(d);
2452 
2453     return 0;
2454 }
2455 
2456 /* vtsc may incur measurable performance degradation, diagnose with this */
2457 static void dump_softtsc(unsigned char key)
2458 {
2459     struct domain *d;
2460     int domcnt = 0;
2461 
2462     tsc_check_reliability();
2463     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
2464         printk("TSC marked as reliable, "
2465                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2466     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
2467     {
2468         printk("TSC has constant rate, ");
2469         if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
2470             printk("no deep Cstates, passed warp test, deemed reliable, ");
2471         else
2472             printk("deep Cstates possible, so not reliable, ");
2473         printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2474     } else
2475         printk("TSC not marked as either constant or reliable, "
2476                "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2477 
2478     rcu_read_lock(&domlist_read_lock);
2479 
2480     for_each_domain ( d )
2481     {
2482         if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT )
2483             continue;
2484         printk("dom%u%s: mode=%d", d->domain_id,
2485                is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
2486         if ( d->arch.vtsc_offset )
2487             printk(",ofs=%#"PRIx64, d->arch.vtsc_offset);
2488         if ( d->arch.tsc_khz )
2489             printk(",khz=%"PRIu32, d->arch.tsc_khz);
2490         if ( d->arch.incarnation )
2491             printk(",inc=%"PRIu32, d->arch.incarnation);
2492         printk("\n");
2493         domcnt++;
2494     }
2495 
2496     rcu_read_unlock(&domlist_read_lock);
2497 
2498     if ( !domcnt )
2499         printk("No domains have emulated TSC\n");
2500 }
2501 
2502 static int __init setup_dump_softtsc(void)
2503 {
2504     register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1);
2505     return 0;
2506 }
2507 __initcall(setup_dump_softtsc);
2508 
2509 /*
2510  * Local variables:
2511  * mode: C
2512  * c-file-style: "BSD"
2513  * c-basic-offset: 4
2514  * tab-width: 4
2515  * indent-tabs-mode: nil
2516  * End:
2517  */
2518