1 /******************************************************************************
2 * arch/x86/time.c
3 *
4 * Per-CPU time calibration and management.
5 *
6 * Copyright (c) 2002-2005, K A Fraser
7 *
8 * Portions from Linux are:
9 * Copyright (c) 1991, 1992, 1995 Linus Torvalds
10 */
11
12 #include <xen/errno.h>
13 #include <xen/event.h>
14 #include <xen/sched.h>
15 #include <xen/lib.h>
16 #include <xen/init.h>
17 #include <xen/param.h>
18 #include <xen/time.h>
19 #include <xen/timer.h>
20 #include <xen/smp.h>
21 #include <xen/irq.h>
22 #include <xen/pci_ids.h>
23 #include <xen/softirq.h>
24 #include <xen/efi.h>
25 #include <xen/cpuidle.h>
26 #include <xen/symbols.h>
27 #include <xen/keyhandler.h>
28 #include <xen/guest_access.h>
29 #include <asm/io.h>
30 #include <asm/iocap.h>
31 #include <asm/msr.h>
32 #include <asm/mpspec.h>
33 #include <asm/processor.h>
34 #include <asm/fixmap.h>
35 #include <asm/guest.h>
36 #include <asm/mc146818rtc.h>
37 #include <asm/div64.h>
38 #include <asm/acpi.h>
39 #include <asm/hpet.h>
40 #include <io_ports.h>
41 #include <asm/setup.h> /* for early_time_init */
42 #include <public/arch-x86/cpuid.h>
43
44 /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi. */
45 static char __initdata opt_clocksource[10];
46 string_param("clocksource", opt_clocksource);
47
48 unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */
49 DEFINE_SPINLOCK(rtc_lock);
50 unsigned long pit0_ticks;
51
52 struct cpu_time_stamp {
53 u64 local_tsc;
54 s_time_t local_stime;
55 s_time_t master_stime;
56 };
57
58 struct cpu_time {
59 struct cpu_time_stamp stamp;
60 struct time_scale tsc_scale;
61 };
62
63 struct platform_timesource {
64 char *id;
65 char *name;
66 u64 frequency;
67 u64 (*read_counter)(void);
68 s64 (*init)(struct platform_timesource *);
69 void (*resume)(struct platform_timesource *);
70 int counter_bits;
71 };
72
73 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
74
75 /* Calibrate all CPUs to platform timer every EPOCH. */
76 #define EPOCH MILLISECS(1000)
77 static struct timer calibration_timer;
78
79 /*
80 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
81 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
82 * softirq handling will happen in time.
83 *
 * The pit_lock protects the 16- and 32-bit stamp fields as well as the
 * latch/read sequence on PIT channel 2 used to sample the counter.
85 */
86 static DEFINE_SPINLOCK(pit_lock);
87 static u16 pit_stamp16;
88 static u32 pit_stamp32;
89 static bool __read_mostly using_pit;
90
91 /* Boot timestamp, filled in head.S */
92 u64 __initdata boot_tsc_stamp;
93
94 /* Per-socket TSC_ADJUST values, for secondary cores/threads to sync to. */
95 static uint64_t *__read_mostly tsc_adjust;
96
97 /*
98 * 32-bit division of integer dividend and integer divisor yielding
99 * 32-bit fractional quotient.
100 */
static inline u32 div_frac(u32 dividend, u32 divisor)
102 {
103 u32 quotient, remainder;
104 ASSERT(dividend < divisor);
105 asm (
106 "divl %4"
107 : "=a" (quotient), "=d" (remainder)
108 : "0" (0), "1" (dividend), "r" (divisor) );
109 return quotient;
110 }
111
112 /*
113 * 32-bit multiplication of multiplicand and fractional multiplier
114 * yielding 32-bit product (radix point at same position as in multiplicand).
115 */
static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
117 {
118 u32 product_int, product_frac;
119 asm (
120 "mul %3"
121 : "=a" (product_frac), "=d" (product_int)
122 : "0" (multiplicand), "r" (multiplier) );
123 return product_int;
124 }
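
/*
 * Worked example of the fixed-point helpers above (illustrative values
 * only): div_frac() returns dividend/divisor as a 0.32 fixed-point
 * fraction and mul_frac() multiplies by such a fraction, keeping only the
 * integer part.  E.g. div_frac(1, 4) == 0x40000000 (i.e. 0.25), and
 * mul_frac(1000, 0x40000000) == 250.
 */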
125
126 /*
127 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
128 * yielding a 64-bit result.
129 */
u64 scale_delta(u64 delta, const struct time_scale *scale)
131 {
132 u64 product;
133
134 if ( scale->shift < 0 )
135 delta >>= -scale->shift;
136 else
137 delta <<= scale->shift;
138
139 asm (
140 "mulq %2 ; shrd $32,%1,%0"
141 : "=a" (product), "=d" (delta)
142 : "rm" (delta), "0" ((u64)scale->mul_frac) );
143
144 return product;
145 }
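
/*
 * In other words, scale_delta() computes
 *     result = delta * 2^scale->shift * scale->mul_frac / 2^32,
 * i.e. the shift pre-scales the delta and mul_frac acts as a 0.32
 * fixed-point multiplier.
 */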
146
147 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
148
149 /* Compute the reciprocal of the given time_scale. */
static inline struct time_scale scale_reciprocal(struct time_scale scale)
151 {
152 struct time_scale reciprocal;
153 u32 dividend;
154
155 ASSERT(scale.mul_frac != 0);
156 dividend = _TS_MUL_FRAC_IDENTITY;
157 reciprocal.shift = 1 - scale.shift;
158 while ( unlikely(dividend >= scale.mul_frac) )
159 {
160 dividend >>= 1;
161 reciprocal.shift++;
162 }
163
164 asm (
165 "divl %4"
166 : "=a" (reciprocal.mul_frac), "=d" (dividend)
167 : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
168
169 return reciprocal;
170 }
171
172 /*
173 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
174 * IPIs in place of local APIC timers
175 */
176 static cpumask_t pit_broadcast_mask;
177
static void smp_send_timer_broadcast_ipi(void)
179 {
180 int cpu = smp_processor_id();
181 cpumask_t mask;
182
183 cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask);
184
185 if ( cpumask_test_cpu(cpu, &mask) )
186 {
187 __cpumask_clear_cpu(cpu, &mask);
188 raise_softirq(TIMER_SOFTIRQ);
189 }
190
191 if ( !cpumask_empty(&mask) )
192 {
193 cpumask_raise_softirq(&mask, TIMER_SOFTIRQ);
194 }
195 }
196
static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
198 {
199 ASSERT(local_irq_is_enabled());
200
201 if ( hpet_legacy_irq_tick() )
202 return;
203
    /* Only for start-of-day interrupt tests in io_apic.c. */
205 pit0_ticks++;
206
207 /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
208 if ( !cpu_has_apic )
209 raise_softirq(TIMER_SOFTIRQ);
210
211 if ( xen_cpuidle )
212 smp_send_timer_broadcast_ipi();
213
214 /* Emulate a 32-bit PIT counter. */
215 if ( using_pit )
216 {
217 u16 count;
218
219 spin_lock_irq(&pit_lock);
220
221 outb(0x80, PIT_MODE);
222 count = inb(PIT_CH2);
223 count |= inb(PIT_CH2) << 8;
224
225 pit_stamp32 += (u16)(pit_stamp16 - count);
226 pit_stamp16 = count;
227
228 spin_unlock_irq(&pit_lock);
229 }
230 }
231
232 static struct irqaction __read_mostly irq0 = {
233 timer_interrupt, "timer", NULL
234 };
235
236 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
237 #define CALIBRATE_FRAC 20 /* calibrate over 50ms */
238 #define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC)
239
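/*
 * Example (assuming the usual HZ of 100): LATCH below works out to
 * (1193182 + 50) / 100 = 11932, so channel 0 fires just under 100Hz,
 * i.e. roughly every 10ms.
 */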
static void preinit_pit(void)
241 {
242 /* Set PIT channel 0 to HZ Hz. */
243 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
244 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
245 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
246 outb(LATCH >> 8, PIT_CH0); /* MSB */
247 #undef LATCH
248 }
249
void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
251 {
252 u64 tps64 = ticks_per_sec;
253 u32 tps32;
254 int shift = 0;
255
256 ASSERT(tps64 != 0);
257
258 while ( tps64 > (MILLISECS(1000)*2) )
259 {
260 tps64 >>= 1;
261 shift--;
262 }
263
264 tps32 = (u32)tps64;
265 while ( tps32 <= (u32)MILLISECS(1000) )
266 {
267 tps32 <<= 1;
268 shift++;
269 }
270
271 ts->mul_frac = div_frac(MILLISECS(1000), tps32);
272 ts->shift = shift;
273 }
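
/*
 * Example (hypothetical 2GHz TSC): set_time_scale(ts, 2000000000) leaves
 * shift == 0 and mul_frac == 0x80000000, so scale_delta() halves the tick
 * count: 1000 ticks -> 500ns, as expected.
 */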
274
static char *freq_string(u64 freq)
276 {
277 static char s[20];
278 unsigned int x, y;
279
280 if ( do_div(freq, 1000) > 500 )
281 ++freq;
282 y = (unsigned int)do_div(freq, 1000);
283 x = (unsigned int)freq;
284 snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
285 return s;
286 }
287
288 /************************************************************
289 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
290 */
291
static u64 read_pit_count(void)
293 {
294 u16 count16;
295 u32 count32;
296 unsigned long flags;
297
298 spin_lock_irqsave(&pit_lock, flags);
299
300 outb(0x80, PIT_MODE);
301 count16 = inb(PIT_CH2);
302 count16 |= inb(PIT_CH2) << 8;
303
304 count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
305
306 spin_unlock_irqrestore(&pit_lock, flags);
307
308 return count32;
309 }
310
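/*
 * Calibration scheme: load CTC channel 2 with a count worth 1/CALIBRATE_FRAC
 * of a second (50ms), time the countdown with the TSC, and multiply the TSC
 * delta by CALIBRATE_FRAC to obtain the TSC frequency in Hz.
 */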
static s64 __init init_pit(struct platform_timesource *pts)
312 {
313 u8 portb = inb(0x61);
314 u64 start, end;
315 unsigned long count;
316
317 using_pit = true;
318
319 /* Set the Gate high, disable speaker. */
320 outb((portb & ~0x02) | 0x01, 0x61);
321
322 /*
323 * Now let's take care of CTC channel 2: mode 0, (interrupt on
324 * terminal count mode), binary count, load CALIBRATE_LATCH count,
325 * (LSB and MSB) to begin countdown.
326 */
327 #define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE)
328 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
329 outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
330 outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
331 #undef CALIBRATE_LATCH
332
333 start = rdtsc_ordered();
334 for ( count = 0; !(inb(0x61) & 0x20); ++count )
335 continue;
336 end = rdtsc_ordered();
337
338 /* Set the Gate low, disable speaker. */
339 outb(portb & ~0x03, 0x61);
340
341 /* Error if the CTC doesn't behave itself. */
342 if ( count == 0 )
343 return 0;
344
345 return (end - start) * CALIBRATE_FRAC;
346 }
347
static void resume_pit(struct platform_timesource *pts)
349 {
350 /* Set CTC channel 2 to mode 0 again; initial value does not matter. */
351 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
352 outb(0, PIT_CH2); /* LSB of count */
353 outb(0, PIT_CH2); /* MSB of count */
354 }
355
356 static struct platform_timesource __initdata plt_pit =
357 {
358 .id = "pit",
359 .name = "PIT",
360 .frequency = CLOCK_TICK_RATE,
361 .read_counter = read_pit_count,
362 .counter_bits = 32,
363 .init = init_pit,
364 .resume = resume_pit,
365 };
366
367 /************************************************************
368 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
369 */
370
static u64 read_hpet_count(void)
372 {
373 return hpet_read32(HPET_COUNTER);
374 }
375
static int64_t __init init_hpet(struct platform_timesource *pts)
377 {
378 uint64_t hpet_rate, start;
379 uint32_t count, target;
380
381 if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
382 cpuidle_using_deep_cstate() )
383 {
384 if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
385 PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
386 switch ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0), PCI_DEVICE_ID) )
387 {
388 /* HPET on Bay Trail platforms will halt in deep C states. */
389 case 0x0f1c:
390 /* HPET on Cherry Trail platforms will halt in deep C states. */
391 case 0x229c:
392 hpet_address = 0;
393 break;
394 }
395
396 /*
397 * Some Coffee Lake platforms have a skewed HPET timer once the SoCs
398 * entered PC10.
399 */
400 if ( pci_conf_read16(PCI_SBDF(0, 0, 0, 0),
401 PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL &&
402 pci_conf_read16(PCI_SBDF(0, 0, 0, 0),
403 PCI_DEVICE_ID) == 0x3ec4 )
404 hpet_address = 0;
405
406 if ( !hpet_address )
407 printk("Disabling HPET for being unreliable\n");
408 }
409
410 if ( (hpet_rate = hpet_setup()) == 0 )
411 return 0;
412
413 pts->frequency = hpet_rate;
414
415 count = hpet_read32(HPET_COUNTER);
416 start = rdtsc_ordered();
417 target = count + CALIBRATE_VALUE(hpet_rate);
418 if ( target < count )
419 while ( hpet_read32(HPET_COUNTER) >= count )
420 continue;
421 while ( hpet_read32(HPET_COUNTER) < target )
422 continue;
423
424 return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
425 }
426
static void resume_hpet(struct platform_timesource *pts)
428 {
429 hpet_resume(NULL);
430 }
431
432 static struct platform_timesource __initdata plt_hpet =
433 {
434 .id = "hpet",
435 .name = "HPET",
436 .read_counter = read_hpet_count,
437 .counter_bits = 32,
438 .init = init_hpet,
439 .resume = resume_hpet
440 };
441
442 /************************************************************
443 * PLATFORM TIMER 3: ACPI PM TIMER
444 */
445
446 u32 __read_mostly pmtmr_ioport;
447 unsigned int __initdata pmtmr_width;
448
449 /* ACPI PM timer ticks at 3.579545 MHz. */
450 #define ACPI_PM_FREQUENCY 3579545
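/* One tick is therefore roughly 279.4ns (10^9 / 3579545). */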
451
static u64 read_pmtimer_count(void)
453 {
454 return inl(pmtmr_ioport);
455 }
456
static s64 __init init_pmtimer(struct platform_timesource *pts)
458 {
459 u64 start;
460 u32 count, target, mask;
461
462 if ( !pmtmr_ioport || (pmtmr_width != 24 && pmtmr_width != 32) )
463 return 0;
464
465 pts->counter_bits = pmtmr_width;
466 mask = 0xffffffff >> (32 - pmtmr_width);
467
468 count = inl(pmtmr_ioport) & mask;
469 start = rdtsc_ordered();
470 target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
471 if ( target < count )
472 while ( (inl(pmtmr_ioport) & mask) >= count )
473 continue;
474 while ( (inl(pmtmr_ioport) & mask) < target )
475 continue;
476
477 return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
478 }
479
480 static struct platform_timesource __initdata plt_pmtimer =
481 {
482 .id = "acpi",
483 .name = "ACPI PM Timer",
484 .frequency = ACPI_PM_FREQUENCY,
485 .read_counter = read_pmtimer_count,
486 .init = init_pmtimer
487 };
488
489 static struct time_scale __read_mostly pmt_scale;
490 static struct time_scale __read_mostly pmt_scale_r;
491
static __init int init_pmtmr_scale(void)
493 {
494 set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
495 pmt_scale_r = scale_reciprocal(pmt_scale);
496 return 0;
497 }
498 __initcall(init_pmtmr_scale);
499
uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
501 {
502 return scale_delta(ticks, &pmt_scale);
503 }
504
uint64_t ns_to_acpi_pm_tick(uint64_t ns)
506 {
507 return scale_delta(ns, &pmt_scale_r);
508 }
509
510 /************************************************************
511 * PLATFORM TIMER 4: TSC
512 */
513 static unsigned int __initdata tsc_flags;
514
515 /* TSC is reliable across sockets */
516 #define TSC_RELIABLE_SOCKET (1 << 0)
517
518 /*
519 * Called in verify_tsc_reliability() under reliable TSC conditions
520 * thus reusing all the checks already performed there.
521 */
static s64 __init init_tsc(struct platform_timesource *pts)
523 {
524 u64 ret = pts->frequency;
525
526 if ( nr_cpu_ids != num_present_cpus() )
527 {
528 printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n");
529 ret = 0;
530 }
531
532 if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) )
533 {
534 printk(XENLOG_WARNING "TSC: Not invariant across sockets\n");
535 ret = 0;
536 }
537
538 if ( !ret )
539 printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n");
540
541 return ret;
542 }
543
static u64 read_tsc(void)
545 {
546 return rdtsc_ordered();
547 }
548
549 static struct platform_timesource __initdata plt_tsc =
550 {
551 .id = "tsc",
552 .name = "TSC",
553 .read_counter = read_tsc,
554 /*
555 * Calculations for platform timer overflow assume u64 boundary.
556 * Hence we set to less than 64, such that the TSC wraparound is
557 * correctly checked and handled.
558 */
559 .counter_bits = 63,
560 .init = init_tsc,
561 };
562
563 #ifdef CONFIG_XEN_GUEST
564 /************************************************************
565 * PLATFORM TIMER 5: XEN PV CLOCK SOURCE
566 *
567 * Xen clock source is a variant of TSC source.
568 */
569 static uint64_t xen_timer_last;
570
static uint64_t xen_timer_cpu_frequency(void)
572 {
573 struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
574 uint64_t freq;
575
576 freq = 1000000000ULL << 32;
577 do_div(freq, info->tsc_to_system_mul);
578 if ( info->tsc_shift < 0 )
579 freq <<= -info->tsc_shift;
580 else
581 freq >>= info->tsc_shift;
582
583 return freq;
584 }
585
static int64_t __init init_xen_timer(struct platform_timesource *pts)
587 {
588 if ( !xen_guest )
589 return 0;
590
591 return xen_timer_cpu_frequency();
592 }
593
static always_inline uint64_t read_cycle(const struct vcpu_time_info *info,
                                         uint64_t tsc)
596 {
597 uint64_t delta = tsc - info->tsc_timestamp;
598 struct time_scale ts = {
599 .shift = info->tsc_shift,
600 .mul_frac = info->tsc_to_system_mul,
601 };
602 uint64_t offset = scale_delta(delta, &ts);
603
604 return info->system_time + offset;
605 }
606
static uint64_t read_xen_timer(void)
608 {
609 struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
610 uint32_t version;
611 uint64_t ret;
612 uint64_t last;
613
614 do {
615 version = info->version & ~1;
616 /* Make sure version is read before the data */
617 smp_rmb();
618
619 ret = read_cycle(info, rdtsc_ordered());
620 /* Ignore fancy flags for now */
621
622 /* Make sure version is reread after the data */
623 smp_rmb();
624 } while ( unlikely(version != info->version) );
625
626 /* Maintain a monotonic global value */
627 do {
628 last = read_atomic(&xen_timer_last);
629 if ( ret < last )
630 return last;
631 } while ( unlikely(cmpxchg(&xen_timer_last, last, ret) != last) );
632
633 return ret;
634 }
635
static void resume_xen_timer(struct platform_timesource *pts)
637 {
638 write_atomic(&xen_timer_last, 0);
639 }
640
641 static struct platform_timesource __initdata plt_xen_timer =
642 {
643 .id = "xen",
644 .name = "XEN PV CLOCK",
645 .frequency = 1000000000ULL,
646 .read_counter = read_xen_timer,
647 .init = init_xen_timer,
648 .resume = resume_xen_timer,
649 .counter_bits = 63,
650 };
651 #endif
652
653 #ifdef CONFIG_HYPERV_GUEST
654 /************************************************************
655 * HYPER-V REFERENCE TSC
656 */
657 #include <asm/guest/hyperv-tlfs.h>
658
659 static struct ms_hyperv_tsc_page *hyperv_tsc;
660 static struct page_info *hyperv_tsc_page;
661
static int64_t __init init_hyperv_timer(struct platform_timesource *pts)
663 {
664 paddr_t maddr;
665 uint64_t tsc_msr, freq;
666
667 if ( !(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) ||
668 !(ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS) )
669 return 0;
670
671 hyperv_tsc_page = alloc_domheap_page(NULL, 0);
672 if ( !hyperv_tsc_page )
673 return 0;
674
675 hyperv_tsc = __map_domain_page_global(hyperv_tsc_page);
676 if ( !hyperv_tsc )
677 {
678 free_domheap_page(hyperv_tsc_page);
679 hyperv_tsc_page = NULL;
680 return 0;
681 }
682
683 maddr = page_to_maddr(hyperv_tsc_page);
684
685 /*
686 * Per Hyper-V TLFS:
687 * 1. Read existing MSR value
688 * 2. Preserve bits [11:1]
689 * 3. Set bits [63:12] to be guest physical address of tsc page
690 * 4. Set enabled bit (0)
691 * 5. Write back new MSR value
692 */
693 rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr);
694 tsc_msr &= 0xffe;
695 tsc_msr |= maddr | 1 /* enabled */;
696 wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr);
697
698 /* Get TSC frequency from Hyper-V */
699 rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
700 pts->frequency = freq;
701
702 return freq;
703 }
704
static uint64_t read_hyperv_timer(void)
706 {
707 uint64_t scale, ret, tsc;
708 int64_t offset;
709 uint32_t seq;
710 const struct ms_hyperv_tsc_page *tsc_page = hyperv_tsc;
711
712 do {
713 seq = tsc_page->tsc_sequence;
714
715 /* Seq 0 is special. It means the TSC enlightenment is not
716 * available at the moment. The reference time can only be
717 * obtained from the Reference Counter MSR.
718 */
719 if ( seq == 0 )
720 {
721 rdmsrl(HV_X64_MSR_TIME_REF_COUNT, ret);
722 return ret;
723 }
724
725 /* rdtsc_ordered already contains a load fence */
726 tsc = rdtsc_ordered();
727 scale = tsc_page->tsc_scale;
728 offset = tsc_page->tsc_offset;
729
730 smp_rmb();
731
732 } while ( tsc_page->tsc_sequence != seq );
733
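    /*
     * Per the TLFS the reference time is ((tsc * scale) >> 64) + offset,
     * which is what hv_scale_tsc() computes.
     */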
734 return hv_scale_tsc(tsc, scale, offset);
735 }
736
737 static struct platform_timesource __initdata plt_hyperv_timer =
738 {
739 .id = "hyperv",
740 .name = "HYPER-V REFERENCE TSC",
741 .read_counter = read_hyperv_timer,
742 .init = init_hyperv_timer,
743 /* See TSC time source for why counter_bits is set to 63 */
744 .counter_bits = 63,
745 };
746 #endif
747
748 /************************************************************
749 * GENERIC PLATFORM TIMER INFRASTRUCTURE
750 */
751
752 /* details of chosen timesource */
753 static struct platform_timesource __read_mostly plt_src;
754 /* hardware-width mask */
755 static u64 __read_mostly plt_mask;
756 /* ns between calls to plt_overflow() */
757 static u64 __read_mostly plt_overflow_period;
758 /* scale: platform counter -> nanosecs */
759 static struct time_scale __read_mostly plt_scale;
760
761 /* Protected by platform_timer_lock. */
762 static DEFINE_SPINLOCK(platform_timer_lock);
763 static s_time_t stime_platform_stamp; /* System time at below platform time */
764 static u64 platform_timer_stamp; /* Platform time at above system time */
765 static u64 plt_stamp64; /* 64-bit platform counter stamp */
766 static u64 plt_stamp; /* hardware-width platform counter stamp */
767 static struct timer plt_overflow_timer;
768
static s_time_t __read_platform_stime(u64 platform_time)
770 {
771 u64 diff = platform_time - platform_timer_stamp;
772 ASSERT(spin_is_locked(&platform_timer_lock));
773 return (stime_platform_stamp + scale_delta(diff, &plt_scale));
774 }
775
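/*
 * Fold the hardware-width counter into the 64-bit software stamp.  This
 * runs every plt_overflow_period, i.e. half the counter's wrap time, so a
 * wrap cannot normally go unnoticed.
 */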
static void plt_overflow(void *unused)
777 {
778 int i;
779 u64 count;
780 s_time_t now, plt_now, plt_wrap;
781
782 spin_lock_irq(&platform_timer_lock);
783
784 count = plt_src.read_counter();
785 plt_stamp64 += (count - plt_stamp) & plt_mask;
786 plt_stamp = count;
787
788 now = NOW();
789 plt_wrap = __read_platform_stime(plt_stamp64);
790 for ( i = 0; i < 10; i++ )
791 {
792 plt_now = plt_wrap;
793 plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1);
794 if ( ABS(plt_wrap - now) > ABS(plt_now - now) )
795 break;
796 plt_stamp64 += plt_mask + 1;
797 }
798 if ( i != 0 )
799 {
800 static bool warned_once;
801
802 if ( !test_and_set_bool(warned_once) )
803 printk("Platform timer appears to have unexpectedly wrapped "
804 "%u%s times.\n", i, (i == 10) ? " or more" : "");
805 }
806
807 spin_unlock_irq(&platform_timer_lock);
808
809 set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
810 }
811
static s_time_t read_platform_stime(u64 *stamp)
813 {
814 u64 plt_counter, count;
815 s_time_t stime;
816
817 ASSERT(!local_irq_is_enabled());
818
819 spin_lock(&platform_timer_lock);
820 plt_counter = plt_src.read_counter();
821 count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask);
822 stime = __read_platform_stime(count);
823 spin_unlock(&platform_timer_lock);
824
825 if ( unlikely(stamp) )
826 *stamp = plt_counter;
827
828 return stime;
829 }
830
static void platform_time_calibration(void)
832 {
833 u64 count;
834 s_time_t stamp;
835 unsigned long flags;
836
837 spin_lock_irqsave(&platform_timer_lock, flags);
838 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
839 stamp = __read_platform_stime(count);
840 stime_platform_stamp = stamp;
841 platform_timer_stamp = count;
842 spin_unlock_irqrestore(&platform_timer_lock, flags);
843 }
844
static void resume_platform_timer(void)
846 {
    /* Timer source can be reset when coming back from S3 to S0 */
848 if ( plt_src.resume )
849 plt_src.resume(&plt_src);
850
851 plt_stamp64 = platform_timer_stamp;
852 plt_stamp = plt_src.read_counter();
853 }
854
static void __init reset_platform_timer(void)
856 {
857 /* Deactivate any timers running */
858 kill_timer(&plt_overflow_timer);
859 kill_timer(&calibration_timer);
860
861 /* Reset counters and stamps */
862 spin_lock_irq(&platform_timer_lock);
863 plt_stamp = 0;
864 plt_stamp64 = 0;
865 platform_timer_stamp = 0;
866 stime_platform_stamp = 0;
867 spin_unlock_irq(&platform_timer_lock);
868 }
869
static s64 __init try_platform_timer(struct platform_timesource *pts)
871 {
872 s64 rc = pts->init(pts);
873
874 if ( rc <= 0 )
875 return rc;
876
877 /* We have a platform timesource already so reset it */
878 if ( plt_src.counter_bits != 0 )
879 reset_platform_timer();
880
881 plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
882
883 set_time_scale(&plt_scale, pts->frequency);
884
885 plt_overflow_period = scale_delta(
886 1ull << (pts->counter_bits - 1), &plt_scale);
887 plt_src = *pts;
888
889 return rc;
890 }
891
static u64 __init init_platform_timer(void)
893 {
894 static struct platform_timesource * __initdata plt_timers[] = {
895 #ifdef CONFIG_XEN_GUEST
896 &plt_xen_timer,
897 #endif
898 #ifdef CONFIG_HYPERV_GUEST
899 &plt_hyperv_timer,
900 #endif
901 &plt_hpet, &plt_pmtimer, &plt_pit
902 };
903
904 struct platform_timesource *pts = NULL;
905 unsigned int i;
906 s64 rc = -1;
907
908 /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */
909 if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") )
910 {
911 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
912 {
913 pts = plt_timers[i];
914 if ( !strcmp(opt_clocksource, pts->id) )
915 {
916 rc = try_platform_timer(pts);
917 break;
918 }
919 }
920
921 if ( rc <= 0 )
922 printk("WARNING: %s clocksource '%s'.\n",
923 (rc == 0) ? "Could not initialise" : "Unrecognised",
924 opt_clocksource);
925 }
926
927 if ( rc <= 0 )
928 {
929 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
930 {
931 pts = plt_timers[i];
932 if ( (rc = try_platform_timer(pts)) > 0 )
933 break;
934 }
935 }
936
937 if ( rc <= 0 )
938 panic("Unable to find usable platform timer\n");
939
940 printk("Platform timer is %s %s\n",
941 freq_string(pts->frequency), pts->name);
942
943 return rc;
944 }
945
u64 stime2tsc(s_time_t stime)
947 {
948 struct cpu_time *t;
949 struct time_scale sys_to_tsc;
950 s_time_t stime_delta;
951
952 t = &this_cpu(cpu_time);
953 sys_to_tsc = scale_reciprocal(t->tsc_scale);
954
955 stime_delta = stime - t->stamp.local_stime;
956 if ( stime_delta < 0 )
957 stime_delta = 0;
958
959 return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc);
960 }
961
void cstate_restore_tsc(void)
963 {
964 struct cpu_time *t = &this_cpu(cpu_time);
965
966 if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
967 return;
968
969 t->stamp.master_stime = read_platform_stime(NULL);
970 t->stamp.local_tsc = stime2tsc(t->stamp.master_stime);
971 t->stamp.local_stime = t->stamp.master_stime;
972
973 write_tsc(t->stamp.local_tsc);
974 }
975
976 /***************************************************************************
977 * CMOS Timer functions
978 ***************************************************************************/
979
980 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
981 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
982 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
983 *
984 * [For the Julian calendar (which was used in Russia before 1917,
985 * Britain & colonies before 1752, anywhere else before 1582,
986 * and is still in use by some communities) leave out the
987 * -year/100+year/400 terms, and add 10.]
988 *
989 * This algorithm was first published by Gauss (I think).
990 *
991 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
 * machines where long is 32-bit! (However, as time_t is signed, we
993 * will already get problems at other places on 2038-01-19 03:14:08)
994 */
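/* For example, mktime(1970, 1, 1, 0, 0, 0) evaluates to 0. */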
995 unsigned long
mktime(unsigned int year, unsigned int mon,
       unsigned int day, unsigned int hour,
       unsigned int min, unsigned int sec)
999 {
1000 /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
1001 if ( 0 >= (int) (mon -= 2) )
1002 {
1003 mon += 12;
1004 year -= 1;
1005 }
1006
1007 return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
1008 year*365 - 719499
1009 )*24 + hour /* now have hours */
1010 )*60 + min /* now have minutes */
1011 )*60 + sec; /* finally seconds */
1012 }
1013
1014 struct rtc_time {
1015 unsigned int year, mon, day, hour, min, sec;
1016 };
1017
static void __get_cmos_time(struct rtc_time *rtc)
1019 {
1020 rtc->sec = CMOS_READ(RTC_SECONDS);
1021 rtc->min = CMOS_READ(RTC_MINUTES);
1022 rtc->hour = CMOS_READ(RTC_HOURS);
1023 rtc->day = CMOS_READ(RTC_DAY_OF_MONTH);
1024 rtc->mon = CMOS_READ(RTC_MONTH);
1025 rtc->year = CMOS_READ(RTC_YEAR);
1026
1027 if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) )
1028 {
1029 BCD_TO_BIN(rtc->sec);
1030 BCD_TO_BIN(rtc->min);
1031 BCD_TO_BIN(rtc->hour);
1032 BCD_TO_BIN(rtc->day);
1033 BCD_TO_BIN(rtc->mon);
1034 BCD_TO_BIN(rtc->year);
1035 }
1036
1037 if ( (rtc->year += 1900) < 1970 )
1038 rtc->year += 100;
1039 }
1040
static unsigned long get_cmos_time(void)
1042 {
1043 unsigned long res, flags;
1044 struct rtc_time rtc;
1045 unsigned int seconds = 60;
1046 static bool __read_mostly cmos_rtc_probe;
1047 boolean_param("cmos-rtc-probe", cmos_rtc_probe);
1048
1049 if ( efi_enabled(EFI_RS) )
1050 {
1051 res = efi_get_time();
1052 if ( res )
1053 return res;
1054 }
1055
1056 if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) )
1057 cmos_rtc_probe = false;
1058 else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe )
1059 panic("System with no CMOS RTC advertised must be booted from EFI"
1060 " (or with command line option \"cmos-rtc-probe\")\n");
1061
1062 for ( ; ; )
1063 {
1064 s_time_t start, t1, t2;
1065
1066 spin_lock_irqsave(&rtc_lock, flags);
1067
1068 /* read RTC exactly on falling edge of update flag */
1069 start = NOW();
1070 do { /* may take up to 1 second... */
1071 t1 = NOW() - start;
1072 } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
1073 t1 <= SECONDS(1) );
1074
1075 start = NOW();
1076 do { /* must try at least 2.228 ms */
1077 t2 = NOW() - start;
1078 } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
1079 t2 < MILLISECS(3) );
1080
1081 __get_cmos_time(&rtc);
1082
1083 spin_unlock_irqrestore(&rtc_lock, flags);
1084
1085 if ( likely(!cmos_rtc_probe) ||
1086 t1 > SECONDS(1) || t2 >= MILLISECS(3) ||
1087 rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 ||
1088 !rtc.day || rtc.day > 31 ||
1089 !rtc.mon || rtc.mon > 12 )
1090 break;
1091
1092 if ( seconds < 60 )
1093 {
1094 if ( rtc.sec != seconds )
1095 cmos_rtc_probe = false;
1096 break;
1097 }
1098
1099 process_pending_softirqs();
1100
1101 seconds = rtc.sec;
1102 }
1103
1104 if ( unlikely(cmos_rtc_probe) )
1105 panic("No CMOS RTC found - system must be booted from EFI\n");
1106
1107 return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec);
1108 }
1109
1110 /* Helpers for guest accesses to the physical RTC. */
unsigned int rtc_guest_read(unsigned int port)
1112 {
1113 const struct domain *currd = current->domain;
1114 unsigned long flags;
1115 unsigned int data = ~0;
1116
1117 switch ( port )
1118 {
1119 case RTC_PORT(0):
1120 /*
1121 * All PV domains (and PVH dom0) are allowed to read the latched value
1122 * of the first RTC port, as there's no access to the physical IO
1123 * ports.
1124 */
1125 data = currd->arch.cmos_idx;
1126 break;
1127
1128 case RTC_PORT(1):
1129 if ( !ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
1130 break;
1131 spin_lock_irqsave(&rtc_lock, flags);
1132 outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
1133 data = inb(RTC_PORT(1));
1134 spin_unlock_irqrestore(&rtc_lock, flags);
1135 break;
1136
1137 default:
1138 ASSERT_UNREACHABLE();
1139 }
1140
1141 return data;
1142 }
1143
void rtc_guest_write(unsigned int port, unsigned int data)
1145 {
1146 struct domain *currd = current->domain;
1147 unsigned long flags;
1148
1149 switch ( port )
1150 {
1151 typeof(pv_rtc_handler) hook;
1152
1153 case RTC_PORT(0):
1154 /*
1155 * All PV domains (and PVH dom0) are allowed to write to the latched
1156 * value of the first RTC port, as there's no access to the physical IO
1157 * ports.
1158 */
1159 currd->arch.cmos_idx = data;
1160 break;
1161
1162 case RTC_PORT(1):
1163 if ( !ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
1164 break;
1165
1166 hook = ACCESS_ONCE(pv_rtc_handler);
1167 if ( hook )
1168 hook(currd->arch.cmos_idx & 0x7f, data);
1169
1170 spin_lock_irqsave(&rtc_lock, flags);
1171 outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
1172 outb(data, RTC_PORT(1));
1173 spin_unlock_irqrestore(&rtc_lock, flags);
1174 break;
1175
1176 default:
1177 ASSERT_UNREACHABLE();
1178 }
1179 }
1180
static unsigned long get_wallclock_time(void)
1182 {
1183 #ifdef CONFIG_XEN_GUEST
1184 if ( xen_guest )
1185 {
1186 struct shared_info *sh_info = XEN_shared_info;
1187 uint32_t wc_version;
1188 uint64_t wc_sec;
1189
1190 do {
1191 wc_version = sh_info->wc_version & ~1;
1192 smp_rmb();
1193
1194 wc_sec = sh_info->wc_sec;
1195 smp_rmb();
1196 } while ( wc_version != sh_info->wc_version );
1197
1198 return wc_sec + read_xen_timer() / 1000000000;
1199 }
1200 #endif
1201
1202 return get_cmos_time();
1203 }
1204
1205 /***************************************************************************
1206 * System Time
1207 ***************************************************************************/
1208
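/*
 * System time is extrapolated from the per-CPU calibration stamp:
 *     now = stamp.local_stime + tsc_scale(tsc - stamp.local_tsc)
 */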
s_time_t get_s_time_fixed(u64 at_tsc)
1210 {
1211 const struct cpu_time *t = &this_cpu(cpu_time);
1212 u64 tsc, delta;
1213
1214 if ( at_tsc )
1215 tsc = at_tsc;
1216 else
1217 tsc = rdtsc_ordered();
1218 delta = tsc - t->stamp.local_tsc;
1219 return t->stamp.local_stime + scale_delta(delta, &t->tsc_scale);
1220 }
1221
s_time_t get_s_time(void)
1223 {
1224 return get_s_time_fixed(0);
1225 }
1226
uint64_t tsc_ticks2ns(uint64_t ticks)
1228 {
1229 struct cpu_time *t = &this_cpu(cpu_time);
1230
1231 return scale_delta(ticks, &t->tsc_scale);
1232 }
1233
static void __update_vcpu_system_time(struct vcpu *v, int force)
1235 {
1236 const struct cpu_time *t;
1237 struct vcpu_time_info *u, _u = {};
1238 struct domain *d = v->domain;
1239 s_time_t tsc_stamp;
1240
1241 if ( v->vcpu_info == NULL )
1242 return;
1243
1244 t = &this_cpu(cpu_time);
1245 u = &vcpu_info(v, time);
1246
1247 if ( d->arch.vtsc )
1248 {
1249 s_time_t stime = t->stamp.local_stime;
1250
1251 if ( is_hvm_domain(d) )
1252 {
1253 struct pl_time *pl = v->domain->arch.hvm.pl_time;
1254
1255 stime += pl->stime_offset + v->arch.hvm.stime_offset;
1256 if ( stime >= 0 )
1257 tsc_stamp = gtime_to_gtsc(d, stime);
1258 else
1259 tsc_stamp = -gtime_to_gtsc(d, -stime);
1260 }
1261 else
1262 tsc_stamp = gtime_to_gtsc(d, stime);
1263
1264 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1265 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
1266 }
1267 else
1268 {
1269 if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
1270 {
1271 tsc_stamp = hvm_scale_tsc(d, t->stamp.local_tsc);
1272 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1273 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
1274 }
1275 else
1276 {
1277 tsc_stamp = t->stamp.local_tsc;
1278 _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
1279 _u.tsc_shift = t->tsc_scale.shift;
1280 }
1281 }
1282
1283 _u.tsc_timestamp = tsc_stamp;
1284 _u.system_time = t->stamp.local_stime;
1285
1286 /*
1287 * It's expected that domains cope with this bit changing on every
1288 * pvclock read to check whether they can resort solely on this tuple
1289 * or if it further requires monotonicity checks with other vcpus.
1290 */
1291 if ( clocksource_is_tsc() )
1292 _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
1293
1294 if ( is_hvm_domain(d) )
1295 _u.tsc_timestamp += v->arch.hvm.cache_tsc_offset;
1296
1297 /* Don't bother unless timestamp record has changed or we are forced. */
1298 _u.version = u->version; /* make versions match for memcmp test */
1299 if ( !force && !memcmp(u, &_u, sizeof(_u)) )
1300 return;
1301
1302 /* 1. Update guest kernel version. */
1303 _u.version = u->version = version_update_begin(u->version);
1304 smp_wmb();
1305 /* 2. Update all other guest kernel fields. */
1306 *u = _u;
1307 smp_wmb();
1308 /* 3. Update guest kernel version. */
1309 u->version = version_update_end(u->version);
1310
1311 if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
1312 !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
1313 v->arch.pv.pending_system_time = _u;
1314 }
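
/*
 * Illustrative only: a minimal sketch (not built) of how a guest is
 * expected to consume the vcpu_time_info record filled in above.  The
 * function name and locals are hypothetical; the version field is used
 * as a seqlock, just as in read_xen_timer() earlier in this file.
 */
#if 0
static uint64_t guest_read_pvclock(const struct vcpu_time_info *info)
{
    uint32_t version;
    uint64_t tsc, delta, ns;

    do {
        version = info->version & ~1;      /* odd => update in progress */
        smp_rmb();
        tsc = rdtsc_ordered();
        delta = tsc - info->tsc_timestamp;
        if ( info->tsc_shift < 0 )
            delta >>= -info->tsc_shift;
        else
            delta <<= info->tsc_shift;
        ns = ((__uint128_t)delta * info->tsc_to_system_mul) >> 32;
        smp_rmb();
    } while ( version != info->version );

    return info->system_time + ns;
}
#endif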
1315
bool update_secondary_system_time(struct vcpu *v,
                                  struct vcpu_time_info *u)
1318 {
1319 XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
1320 struct guest_memory_policy policy = { .nested_guest_mode = false };
1321
1322 if ( guest_handle_is_null(user_u) )
1323 return true;
1324
1325 update_guest_memory_policy(v, &policy);
1326
1327 /* 1. Update userspace version. */
1328 if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
1329 {
1330 update_guest_memory_policy(v, &policy);
1331 return false;
1332 }
1333 smp_wmb();
1334 /* 2. Update all other userspace fields. */
1335 __copy_to_guest(user_u, u, 1);
1336 smp_wmb();
1337 /* 3. Update userspace version. */
1338 u->version = version_update_end(u->version);
1339 __copy_field_to_guest(user_u, u, version);
1340
1341 update_guest_memory_policy(v, &policy);
1342
1343 return true;
1344 }
1345
void update_vcpu_system_time(struct vcpu *v)
1347 {
1348 __update_vcpu_system_time(v, 0);
1349 }
1350
void force_update_vcpu_system_time(struct vcpu *v)
1352 {
1353 __update_vcpu_system_time(v, 1);
1354 }
1355
static void update_domain_rtc(void)
1357 {
1358 struct domain *d;
1359
1360 rcu_read_lock(&domlist_read_lock);
1361
1362 for_each_domain ( d )
1363 if ( is_hvm_domain(d) )
1364 rtc_update_clock(d);
1365
1366 rcu_read_unlock(&domlist_read_lock);
1367 }
1368
void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
1370 {
1371 d->time_offset.seconds = time_offset_seconds;
1372 d->time_offset.set = true;
1373 if ( is_hvm_domain(d) )
1374 rtc_update_clock(d);
1375 update_domain_wallclock_time(d);
1376 }
1377
int cpu_frequency_change(u64 freq)
1379 {
1380 struct cpu_time *t = &this_cpu(cpu_time);
1381 u64 curr_tsc;
1382
1383 /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
1384 if ( freq < 1000000u )
1385 {
1386 printk(XENLOG_WARNING "Rejecting CPU frequency change "
1387 "to %"PRIu64" Hz\n", freq);
1388 return -EINVAL;
1389 }
1390
1391 local_irq_disable();
1392 /* Platform time /first/, as we may be delayed by platform_timer_lock. */
1393 t->stamp.master_stime = read_platform_stime(NULL);
1394 curr_tsc = rdtsc_ordered();
1395 /* TSC-extrapolated time may be bogus after frequency change. */
1396 /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/
1397 t->stamp.local_stime = t->stamp.master_stime;
1398 t->stamp.local_tsc = curr_tsc;
1399 set_time_scale(&t->tsc_scale, freq);
1400 local_irq_enable();
1401
1402 update_vcpu_system_time(current);
1403
1404 /* A full epoch should pass before we check for deviation. */
1405 if ( smp_processor_id() == 0 )
1406 {
1407 set_timer(&calibration_timer, NOW() + EPOCH);
1408 platform_time_calibration();
1409 }
1410
1411 return 0;
1412 }
1413
1414 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
1415 static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration);
1416
1417 /* Softirq handler for per-CPU time calibration. */
static void local_time_calibration(void)
1419 {
1420 struct cpu_time *t = &this_cpu(cpu_time);
1421 const struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1422
1423 /*
1424 * System (extrapolated from local and master oscillators) and TSC
1425 * timestamps, taken during this calibration and the previous one.
1426 */
1427 struct cpu_time_stamp prev, curr;
1428
1429 /*
1430 * System time and TSC ticks elapsed during the previous calibration
1431 * 'epoch'. These values are down-shifted to fit in 32 bits.
1432 */
1433 u64 stime_elapsed64, tsc_elapsed64;
1434 u32 stime_elapsed32, tsc_elapsed32;
1435
1436 /* Error correction to slow down a fast local clock. */
1437 u32 error_factor = 0;
1438
1439 /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1440 int tsc_shift = 0;
1441
1442 /* The overall calibration scale multiplier. */
1443 u32 calibration_mul_frac;
1444
1445 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1446 {
1447 /* Atomically read cpu_calibration struct and write cpu_time struct. */
1448 local_irq_disable();
1449 t->stamp = *c;
1450 local_irq_enable();
1451 update_vcpu_system_time(current);
1452 goto out;
1453 }
1454
1455 prev = t->stamp;
1456
1457 /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1458 local_irq_disable();
1459 curr = *c;
1460 local_irq_enable();
1461
1462 #if 0
1463 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1464 smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime);
1465 printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1466 " -> %"PRId64"\n",
1467 smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime,
1468 curr.master_stime - curr.local_stime);
1469 #endif
1470
1471 /* Local time warps forward if it lags behind master time. */
1472 if ( curr.local_stime < curr.master_stime )
1473 curr.local_stime = curr.master_stime;
1474
1475 stime_elapsed64 = curr.master_stime - prev.master_stime;
1476 tsc_elapsed64 = curr.local_tsc - prev.local_tsc;
1477
1478 /*
1479 * Weirdness can happen if we lose sync with the platform timer.
1480 * We could be smarter here: resync platform timer with local timer?
1481 */
1482 if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1483 goto out;
1484
1485 /*
1486 * Calculate error-correction factor. This only slows down a fast local
1487 * clock (slow clocks are warped forwards). The scale factor is clamped
1488 * to >= 0.5.
1489 */
1490 if ( curr.local_stime != curr.master_stime )
1491 {
1492 u64 local_stime_err = curr.local_stime - curr.master_stime;
1493
1494 if ( local_stime_err > EPOCH )
1495 local_stime_err = EPOCH;
1496 error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1497 }
1498
1499 /*
1500 * We require 0 < stime_elapsed < 2^31.
1501 * This allows us to binary shift a 32-bit tsc_elapsed such that:
1502 * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1503 */
1504 while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1505 ((s32)stime_elapsed64 < 0) )
1506 {
1507 stime_elapsed64 >>= 1;
1508 tsc_elapsed64 >>= 1;
1509 }
1510
1511 /* stime_master_diff now fits in a 32-bit word. */
1512 stime_elapsed32 = (u32)stime_elapsed64;
1513
1514 /* tsc_elapsed <= 2*stime_elapsed */
1515 while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1516 {
1517 tsc_elapsed64 >>= 1;
1518 tsc_shift--;
1519 }
1520
1521 /* Local difference must now fit in 32 bits. */
1522 ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1523 tsc_elapsed32 = (u32)tsc_elapsed64;
1524
1525 /* tsc_elapsed > stime_elapsed */
1526 ASSERT(tsc_elapsed32 != 0);
1527 while ( tsc_elapsed32 <= stime_elapsed32 )
1528 {
1529 tsc_elapsed32 <<= 1;
1530 tsc_shift++;
1531 }
1532
1533 calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1534 if ( error_factor != 0 )
1535 calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1536
1537 #if 0
1538 printk("---%d: %08x %08x %d\n", smp_processor_id(),
1539 error_factor, calibration_mul_frac, tsc_shift);
1540 #endif
1541
1542 /* Record new timestamp information, atomically w.r.t. interrupts. */
1543 local_irq_disable();
1544 t->tsc_scale.mul_frac = calibration_mul_frac;
1545 t->tsc_scale.shift = tsc_shift;
1546 t->stamp = curr;
1547 local_irq_enable();
1548
1549 update_vcpu_system_time(current);
1550
1551 out:
1552 if ( smp_processor_id() == 0 )
1553 {
1554 set_timer(&calibration_timer, NOW() + EPOCH);
1555 platform_time_calibration();
1556 }
1557 }
1558
1559 /*
1560 * TSC Reliability check
1561 */
1562
1563 /*
1564 * The Linux original version of this function is
1565 * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1566 */
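/*
 * Each participating CPU repeatedly takes a global lock, reads its TSC and
 * compares it with the last TSC recorded (possibly by another CPU).  Any
 * backwards step indicates unsynchronised TSCs; the largest step seen is
 * accumulated in *max_warp.
 */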
static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1568 {
1569 static DEFINE_SPINLOCK(sync_lock);
1570 static cycles_t last_tsc;
1571
1572 cycles_t start, now, prev, end;
1573 int i;
1574
1575 start = rdtsc_ordered();
1576
1577 /* The measurement runs for 20 msecs: */
1578 end = start + tsc_khz * 20ULL;
1579 now = start;
1580
1581 for ( i = 0; ; i++ )
1582 {
1583 /*
1584 * We take the global lock, measure TSC, save the
1585 * previous TSC that was measured (possibly on
1586 * another CPU) and update the previous TSC timestamp.
1587 */
1588 spin_lock(&sync_lock);
1589 prev = last_tsc;
1590 now = rdtsc_ordered();
1591 last_tsc = now;
1592 spin_unlock(&sync_lock);
1593
1594 /*
1595 * Be nice every now and then (and also check whether measurement is
         * done [we also insert a 10 million loops safety exit, so we don't
1597 * lock up in case the TSC readout is totally broken]):
1598 */
1599 if ( unlikely(!(i & 7)) )
1600 {
1601 if ( (now > end) || (i > 10000000) )
1602 break;
1603 cpu_relax();
1604 /*touch_nmi_watchdog();*/
1605 }
1606
1607 /*
1608 * Outside the critical section we can now see whether we saw a
1609 * time-warp of the TSC going backwards:
1610 */
1611 if ( unlikely(prev > now) )
1612 {
1613 spin_lock(&sync_lock);
1614 if ( *max_warp < prev - now )
1615 *max_warp = prev - now;
1616 spin_unlock(&sync_lock);
1617 }
1618 }
1619 }
1620
1621 static unsigned long tsc_max_warp, tsc_check_count;
1622 static cpumask_t tsc_check_cpumask;
1623
static void tsc_check_slave(void *unused)
1625 {
1626 unsigned int cpu = smp_processor_id();
1627 local_irq_disable();
1628 while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) )
1629 cpu_relax();
1630 check_tsc_warp(cpu_khz, &tsc_max_warp);
1631 cpumask_clear_cpu(cpu, &tsc_check_cpumask);
1632 local_irq_enable();
1633 }
1634
static void tsc_check_reliability(void)
1636 {
1637 unsigned int cpu = smp_processor_id();
1638 static DEFINE_SPINLOCK(lock);
1639
1640 spin_lock(&lock);
1641
1642 tsc_check_count++;
1643 smp_call_function(tsc_check_slave, NULL, 0);
1644 cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu));
1645 local_irq_disable();
1646 check_tsc_warp(cpu_khz, &tsc_max_warp);
1647 local_irq_enable();
1648 while ( !cpumask_empty(&tsc_check_cpumask) )
1649 cpu_relax();
1650
1651 spin_unlock(&lock);
1652 }
1653
1654 /*
1655 * Rendezvous for all CPUs in IRQ context.
1656 * Master CPU snapshots the platform timer.
1657 * All CPUS snapshot their local TSC and extrapolation of system time.
1658 */
1659 struct calibration_rendezvous {
1660 cpumask_t cpu_calibration_map;
1661 atomic_t semaphore;
1662 s_time_t master_stime;
1663 u64 master_tsc_stamp;
1664 };
1665
1666 static void
time_calibration_rendezvous_tail(const struct calibration_rendezvous *r)
1668 {
1669 struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1670
1671 c->local_tsc = rdtsc_ordered();
1672 c->local_stime = get_s_time_fixed(c->local_tsc);
1673 c->master_stime = r->master_stime;
1674
1675 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1676 }
1677
1678 /*
1679 * Keep TSCs in sync when they run at the same rate, but may stop in
1680 * deep-sleep C states.
1681 */
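/*
 * The semaphore implements a two-phase barrier per loop iteration: CPU0
 * waits for all other CPUs to arrive, publishes the platform time / TSC
 * snapshot, and on the final iteration every CPU writes the shared TSC
 * value so all TSCs leave the rendezvous in sync.
 */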
static void time_calibration_tsc_rendezvous(void *_r)
1683 {
1684 int i;
1685 struct calibration_rendezvous *r = _r;
1686 unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1687
1688 /* Loop to get rid of cache effects on TSC skew. */
1689 for ( i = 4; i >= 0; i-- )
1690 {
1691 if ( smp_processor_id() == 0 )
1692 {
1693 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1694 cpu_relax();
1695
1696 if ( r->master_stime == 0 )
1697 {
1698 r->master_stime = read_platform_stime(NULL);
1699 r->master_tsc_stamp = rdtsc_ordered();
1700 }
1701 atomic_inc(&r->semaphore);
1702
1703 if ( i == 0 )
1704 write_tsc(r->master_tsc_stamp);
1705
1706 while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1707 cpu_relax();
1708 atomic_set(&r->semaphore, 0);
1709 }
1710 else
1711 {
1712 atomic_inc(&r->semaphore);
1713 while ( atomic_read(&r->semaphore) < total_cpus )
1714 cpu_relax();
1715
1716 if ( i == 0 )
1717 write_tsc(r->master_tsc_stamp);
1718
1719 atomic_inc(&r->semaphore);
1720 while ( atomic_read(&r->semaphore) > total_cpus )
1721 cpu_relax();
1722 }
1723 }
1724
1725 time_calibration_rendezvous_tail(r);
1726 }
1727
1728 /* Ordinary rendezvous function which does not modify TSC values. */
static void time_calibration_std_rendezvous(void *_r)
1730 {
1731 struct calibration_rendezvous *r = _r;
1732 unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1733
1734 if ( smp_processor_id() == 0 )
1735 {
1736 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1737 cpu_relax();
1738 r->master_stime = read_platform_stime(NULL);
1739 smp_wmb(); /* write r->master_stime /then/ signal */
1740 atomic_inc(&r->semaphore);
1741 }
1742 else
1743 {
1744 atomic_inc(&r->semaphore);
1745 while ( atomic_read(&r->semaphore) != total_cpus )
1746 cpu_relax();
1747 smp_rmb(); /* receive signal /then/ read r->master_stime */
1748 }
1749
1750 time_calibration_rendezvous_tail(r);
1751 }
1752
1753 /*
1754 * Rendezvous function used when clocksource is TSC and
1755 * no CPU hotplug will be performed.
1756 */
static void time_calibration_nop_rendezvous(void *rv)
1758 {
1759 const struct calibration_rendezvous *r = rv;
1760 struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1761
1762 c->local_tsc = r->master_tsc_stamp;
1763 c->local_stime = r->master_stime;
1764 c->master_stime = r->master_stime;
1765
1766 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1767 }
1768
1769 static void (*time_calibration_rendezvous_fn)(void *) =
1770 time_calibration_std_rendezvous;
1771
static void time_calibration(void *unused)
1773 {
1774 struct calibration_rendezvous r = {
1775 .semaphore = ATOMIC_INIT(0)
1776 };
1777
1778 if ( clocksource_is_tsc() )
1779 {
1780 local_irq_disable();
1781 r.master_stime = read_platform_stime(&r.master_tsc_stamp);
1782 local_irq_enable();
1783 }
1784
1785 cpumask_copy(&r.cpu_calibration_map, &cpu_online_map);
1786
1787 /* @wait=1 because we must wait for all cpus before freeing @r. */
1788 on_selected_cpus(&r.cpu_calibration_map,
1789 time_calibration_rendezvous_fn,
1790 &r, 1);
1791 }
1792
1793 static struct cpu_time_stamp ap_bringup_ref;
1794
void time_latch_stamps(void)
1796 {
1797 unsigned long flags;
1798
1799 local_irq_save(flags);
1800 ap_bringup_ref.master_stime = read_platform_stime(NULL);
1801 ap_bringup_ref.local_tsc = rdtsc_ordered();
1802 local_irq_restore(flags);
1803
1804 ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc);
1805 }
1806
void init_percpu_time(void)
1808 {
1809 struct cpu_time *t = &this_cpu(cpu_time);
1810 unsigned long flags;
1811 u64 tsc;
1812 s_time_t now;
1813
1814 /* Initial estimate for TSC rate. */
1815 t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1816
1817 if ( tsc_adjust )
1818 {
1819 unsigned int socket = cpu_to_socket(smp_processor_id());
1820 int64_t adj;
1821
1822 /* For now we don't want to come here for the BSP. */
1823 ASSERT(system_state >= SYS_STATE_smp_boot);
1824
1825 rdmsrl(MSR_IA32_TSC_ADJUST, adj);
1826
1827 /*
1828 * Check whether this CPU is the first in a package to come up. In
1829 * this case do not check the boot value against another package
1830 * because the new package might have been physically hotplugged,
1831 * where TSC_ADJUST is expected to be different.
1832 */
1833 if ( cpumask_weight(socket_cpumask[socket]) == 1 )
1834 {
1835 /*
1836 * On the boot CPU we just force the ADJUST value to 0 if it's non-
1837 * zero (in early_time_init()). We don't do that on non-boot CPUs
1838 * because physical hotplug should have set the ADJUST register to a
1839 * value > 0, so the TSC is in sync with the already running CPUs.
1840 *
1841 * But we always force non-negative ADJUST values for now.
1842 */
1843 if ( adj < 0 )
1844 {
1845 printk(XENLOG_WARNING
1846 "TSC ADJUST set to -%lx on CPU%u - clearing\n",
1847 -adj, smp_processor_id());
1848 wrmsrl(MSR_IA32_TSC_ADJUST, 0);
1849 adj = 0;
1850 }
1851 tsc_adjust[socket] = adj;
1852 }
1853 else if ( adj != tsc_adjust[socket] )
1854 {
1855 static bool __read_mostly warned;
1856
1857 if ( !warned )
1858 {
1859 warned = true;
1860 printk(XENLOG_WARNING
1861 "Differing TSC ADJUST values within socket(s) - fixing all\n");
1862 }
1863 wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust[socket]);
1864 }
1865 }
1866
1867 local_irq_save(flags);
1868 now = read_platform_stime(NULL);
1869 tsc = rdtsc_ordered();
1870 local_irq_restore(flags);
1871
1872 t->stamp.master_stime = now;
1873 /*
1874 * To avoid a discontinuity (TSC and platform clock can't be expected
1875 * to be in perfect sync), initialization here needs to match up with
1876 * local_time_calibration()'s decision whether to use its fast path.
1877 */
1878 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1879 {
1880 if ( system_state < SYS_STATE_smp_boot )
1881 now = get_s_time_fixed(tsc);
1882 else
1883 now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
1884 }
1885 t->stamp.local_tsc = tsc;
1886 t->stamp.local_stime = now;
1887 }
1888
1889 /*
1890 * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits.
1891 * Obviously we must not use write_tsc() on such CPUs.
1892 *
1893 * Additionally, AMD specifies that being able to write the TSC MSR is not an
1894 * architectural feature (but, other than their manual says, also cannot be
1895 * determined from CPUID bits).
1896 */
static void __init tsc_check_writability(void)
1898 {
1899 const char *what = NULL;
1900 uint64_t tsc;
1901
1902 /*
1903 * If all CPUs are reported as synchronised and in sync, we never write
1904 * the TSCs (except unavoidably, when a CPU is physically hot-plugged).
1905 * Hence testing for writability is pointless and even harmful.
1906 */
1907 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1908 return;
1909
1910 tsc = rdtsc();
1911 if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
1912 {
1913 uint64_t tmp, tmp2 = rdtsc();
1914
1915 write_tsc(tsc | (1ULL << 32));
1916 tmp = rdtsc();
1917 if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
1918 what = "only partially";
1919 }
1920 else
1921 {
1922 what = "not";
1923 }
1924
1925 /* Nothing to do if the TSC is fully writable. */
1926 if ( !what )
1927 {
1928 /*
1929 * Paranoia - write back original TSC value. However, APs get synced
1930 * with BSP as they are brought up, so this doesn't much matter.
1931 */
1932 write_tsc(tsc);
1933 return;
1934 }
1935
1936 printk(XENLOG_WARNING "TSC %s writable\n", what);
1937
1938 /* time_calibration_tsc_rendezvous() must not be used */
1939 setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1940
1941 /* cstate_restore_tsc() must not be used (or do nothing) */
1942 if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
1943 cpuidle_disable_deep_cstate();
1944
1945 /* synchronize_tsc_slave() must do nothing */
1946 disable_tsc_sync = true;
1947 }
1948
static void __init reset_percpu_time(void *unused)
1950 {
1951 struct cpu_time *t = &this_cpu(cpu_time);
1952
1953 t->stamp.local_tsc = boot_tsc_stamp;
1954 t->stamp.local_stime = 0;
1955 t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp);
1956 t->stamp.master_stime = t->stamp.local_stime;
1957 }
1958
static void __init try_platform_timer_tail(bool late)
1960 {
1961 init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
1962 plt_overflow(NULL);
1963
1964 platform_timer_stamp = plt_stamp64;
1965 stime_platform_stamp = NOW();
1966
1967 if ( !late )
1968 init_percpu_time();
1969
1970 init_timer(&calibration_timer, time_calibration, NULL, 0);
1971 set_timer(&calibration_timer, NOW() + EPOCH);
1972 }
1973
1974 /* Late init function, after all cpus have booted */
1975 static int __init verify_tsc_reliability(void)
1976 {
1977 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1978 {
1979 /*
1980 * Sadly, despite processor vendors' best design guidance efforts, on
1981 * some systems, cpus may come out of reset improperly synchronized.
1982 * So we must verify there is no warp and we can't do that until all
1983 * CPUs are booted.
1984 */
1985 tsc_check_reliability();
1986 if ( tsc_max_warp )
1987 {
1988 printk("TSC warp detected, disabling TSC_RELIABLE\n");
1989 setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1990 }
1991 else if ( !strcmp(opt_clocksource, "tsc") &&
1992 (try_platform_timer(&plt_tsc) > 0) )
1993 {
1994             /*
1995              * The platform timer has changed and CPU time will only be updated
1996              * after we set the calibration timer again, which means we need to
1997              * re-seed each local CPU time. At this stage the TSC is known to be
1998              * reliable, i.e. monotonically increasing across all CPUs, so this
1999              * lets us remove the skew between the platform timer and the TSC,
2000              * since these are now effectively the same.
2001              */
2002 on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1);
2003
2004             /*
2005              * CPU hotplug won't be performed and the TSC clocksource is in
2006              * use, which means we have a reliable TSC; since we also don't
2007              * sync with any other clocksource, no rendezvous is needed.
2008              */
2009 time_calibration_rendezvous_fn = time_calibration_nop_rendezvous;
2010
2011 /* Finish platform timer switch. */
2012 try_platform_timer_tail(true);
2013
2014 printk("Switched to Platform timer %s TSC\n",
2015 freq_string(plt_src.frequency));
2016 return 0;
2017 }
2018 }
2019
2020 /*
2021 * Re-run the TSC writability check if it didn't run to completion, as
2022 * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed
2023 * for determining which rendezvous function to use (below).
2024 */
2025 if ( !disable_tsc_sync )
2026 tsc_check_writability();
2027
2028     /*
2029      * While the scale factor can be shared with constant-rate TSCs, TSCs that
2030      * are not marked as 'reliable' still need re-syncing during rendezvous.
2031      */
2032 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
2033 !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
2034 time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
2035
2036 return 0;
2037 }
2038 __initcall(verify_tsc_reliability);
2039
2040 /* Late init function (after interrupts are enabled). */
2041 int __init init_xen_time(void)
2042 {
2043 tsc_check_writability();
2044
2045 open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
2046
2047 /* NB. get_wallclock_time() can take over one second to execute. */
2048 do_settime(get_wallclock_time(), 0, NOW());
2049
2050 /* Finish platform timer initialization. */
2051 try_platform_timer_tail(false);
2052
2053 /*
2054 * Setup space to track per-socket TSC_ADJUST values. Don't fiddle with
2055 * values if the TSC is not reported as invariant. Ignore allocation
2056 * failure here - most systems won't need any adjustment anyway.
2057 */
2058 if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
2059 boot_cpu_has(X86_FEATURE_ITSC) )
2060 tsc_adjust = xzalloc_array(uint64_t, nr_sockets);
2061
2062 return 0;
2063 }
2064
2065
2066 /* Early init function. */
2067 void __init early_time_init(void)
2068 {
2069 struct cpu_time *t = &this_cpu(cpu_time);
2070 u64 tmp;
2071
2072 if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
2073 boot_cpu_has(X86_FEATURE_ITSC) )
2074 {
2075 rdmsrl(MSR_IA32_TSC_ADJUST, tmp);
2076 if ( tmp )
2077 {
2078 printk(XENLOG_WARNING
2079 "TSC ADJUST set to %lx on boot CPU - clearing\n", tmp);
2080 wrmsrl(MSR_IA32_TSC_ADJUST, 0);
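                 /*
                  * boot_tsc_stamp was taken with the firmware's adjustment
                  * still in effect; subtract it so the stamp matches the now
                  * unadjusted TSC.
                  */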
2081 boot_tsc_stamp -= tmp;
2082 }
2083 }
2084
2085 preinit_pit();
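         /*
          * init_platform_timer() returns the calibrated TSC frequency in Hz;
          * record it for the (potential) TSC clocksource and use it to derive
          * tsc_scale and cpu_khz below.
          */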
2086 tmp = init_platform_timer();
2087 plt_tsc.frequency = tmp;
2088
2089 set_time_scale(&t->tsc_scale, tmp);
2090 t->stamp.local_tsc = boot_tsc_stamp;
2091
2092 do_div(tmp, 1000);
2093 cpu_khz = (unsigned long)tmp;
2094 printk("Detected %lu.%03lu MHz processor.\n",
2095 cpu_khz / 1000, cpu_khz % 1000);
2096
2097 setup_irq(0, 0, &irq0);
2098 }
2099
2100 /* Keep the PIT enabled so pit_broadcast works while cpuidle is enabled. */
2101 static int _disable_pit_irq(void(*hpet_broadcast_setup)(void))
2102 {
2103 int ret = 1;
2104
2105 if ( using_pit || !cpu_has_apic )
2106 return -1;
2107
2108 /*
2109 * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
2110 * emulation when entering deep C states.
2111 * XXX dom0 may rely on RTC interrupt delivery, so only enable
2112      * hpet_broadcast if FSB mode is available or if force_hpet_broadcast is set.
2113 */
2114 if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) )
2115 {
2116 hpet_broadcast_setup();
2117 if ( !hpet_broadcast_is_available() )
2118 {
2119 if ( xen_cpuidle > 0 )
2120 {
2121 printk("%ps() failed, turning to PIT broadcast\n",
2122 hpet_broadcast_setup);
2123 return -1;
2124 }
2125 ret = 0;
2126 }
2127 }
2128
2129 /* Disable PIT CH0 timer interrupt. */
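         /*
          * 0x30 = select channel 0, lobyte/hibyte access, mode 0 (interrupt
          * on terminal count); mode 0 is one-shot, so once the written count
          * expires the channel generates no further interrupts.
          */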
2130 outb_p(0x30, PIT_MODE);
2131 outb_p(0, PIT_CH0);
2132 outb_p(0, PIT_CH0);
2133
2134 return ret;
2135 }
2136
2137 static int __init disable_pit_irq(void)
2138 {
2139 if ( !_disable_pit_irq(hpet_broadcast_init) )
2140 {
2141 xen_cpuidle = 0;
2142 printk("CPUIDLE: disabled due to no HPET. "
2143 "Force enable with 'cpuidle'.\n");
2144 }
2145
2146 return 0;
2147 }
2148 __initcall(disable_pit_irq);
2149
2150 void pit_broadcast_enter(void)
2151 {
2152 cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask);
2153 }
2154
2155 void pit_broadcast_exit(void)
2156 {
2157 int cpu = smp_processor_id();
2158
2159 if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) )
2160 reprogram_timer(this_cpu(timer_deadline));
2161 }
2162
2163 int pit_broadcast_is_available(void)
2164 {
2165 return cpuidle_using_deep_cstate();
2166 }
2167
2168 void send_timer_event(struct vcpu *v)
2169 {
2170 send_guest_vcpu_virq(v, VIRQ_TIMER);
2171 }
2172
2173 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
2174 static long cmos_utc_offset; /* in seconds */
2175
2176 int time_suspend(void)
2177 {
2178 if ( smp_processor_id() == 0 )
2179 {
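             /*
              * Record the offset between Xen's wallclock and the CMOS clock,
              * so that time_resume() can re-apply it to a fresh CMOS reading.
              */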
2180 cmos_utc_offset = -get_wallclock_time();
2181 cmos_utc_offset += get_sec();
2182 kill_timer(&calibration_timer);
2183
2184 /* Sync platform timer stamps. */
2185 platform_time_calibration();
2186 }
2187
2188     /* Better to also cancel any pending calibration softirq, for accuracy. */
2189 clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
2190
2191 return 0;
2192 }
2193
2194 int time_resume(void)
2195 {
2196 preinit_pit();
2197
2198 resume_platform_timer();
2199
2200 if ( !_disable_pit_irq(hpet_broadcast_resume) )
2201 BUG();
2202
2203 init_percpu_time();
2204
2205 set_timer(&calibration_timer, NOW() + EPOCH);
2206
2207 do_settime(get_wallclock_time() + cmos_utc_offset, 0, NOW());
2208
2209 update_vcpu_system_time(current);
2210
2211 update_domain_rtc();
2212
2213 return 0;
2214 }
2215
2216 int hwdom_pit_access(struct ioreq *ioreq)
2217 {
2218 /* Is Xen using Channel 2? Then disallow direct dom0 access. */
2219 if ( using_pit )
2220 return 0;
2221
2222 switch ( ioreq->addr )
2223 {
2224 case PIT_CH2:
2225 if ( ioreq->dir == IOREQ_READ )
2226 ioreq->data = inb(PIT_CH2);
2227 else
2228 outb(ioreq->data, PIT_CH2);
2229 return 1;
2230
2231 case PIT_MODE:
2232 if ( ioreq->dir == IOREQ_READ )
2233 return 0; /* urk! */
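             /*
              * Bits 7:6 of the mode/command byte select the target: 0x80 is
              * "select counter 2", while 0xc0 is the read-back command, which
              * can address several counters at once and so needs filtering.
              */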
2234 switch ( ioreq->data & 0xc0 )
2235 {
2236 case 0xc0: /* Read Back */
2237 if ( ioreq->data & 0x08 ) /* Select Channel 2? */
2238 outb(ioreq->data & 0xf8, PIT_MODE);
2239 if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
2240 return 1; /* no - we're done */
2241 /* Filter Channel 2 and reserved bit 0. */
2242 ioreq->data &= ~0x09;
2243 return 0; /* emulate ch0/1 readback */
2244 case 0x80: /* Select Counter 2 */
2245 outb(ioreq->data, PIT_MODE);
2246 return 1;
2247 }
2248 break;
2249
2250 case 0x61:
2251 if ( ioreq->dir == IOREQ_READ )
2252 ioreq->data = inb(0x61);
2253 else
2254 outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
2255 return 1;
2256 }
2257
2258 return 0;
2259 }
2260
2261 /*
2262 * PV SoftTSC Emulation.
2263 */
2264
2265 /*
2266 * tsc=unstable: Override all tests; assume TSC is unreliable.
2267 * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs.
2268 * tsc=stable:socket: Assume TSCs are reliable across sockets.
2269 */
2270 static int __init tsc_parse(const char *s)
2271 {
2272 if ( !strcmp(s, "unstable") )
2273 {
2274 setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
2275 setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC);
2276 setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
2277 }
2278 else if ( !strcmp(s, "skewed") )
2279 setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
2280 else if ( !strcmp(s, "stable:socket") )
2281 tsc_flags |= TSC_RELIABLE_SOCKET;
2282 else
2283 return -EINVAL;
2284
2285 return 0;
2286 }
2287 custom_param("tsc", tsc_parse);
2288
2289 u64 gtime_to_gtsc(struct domain *d, u64 time)
2290 {
2291 if ( !is_hvm_domain(d) )
2292 {
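             /*
              * PV guests see a virtual TSC that starts at zero at
              * vtsc_offset; times before that map to (wrapped) negative TSC
              * values.
              */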
2293 if ( time < d->arch.vtsc_offset )
2294 return -scale_delta(d->arch.vtsc_offset - time,
2295 &d->arch.ns_to_vtsc);
2296 time -= d->arch.vtsc_offset;
2297 }
2298 return scale_delta(time, &d->arch.ns_to_vtsc);
2299 }
2300
2301 u64 gtsc_to_gtime(struct domain *d, u64 tsc)
2302 {
2303 u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);
2304
2305 if ( !is_hvm_domain(d) )
2306 time += d->arch.vtsc_offset;
2307 return time;
2308 }
2309
2310 uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs)
2311 {
2312 s_time_t old, new, now = get_s_time();
2313 struct domain *d = v->domain;
2314
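         /*
          * Keep the emulated TSC strictly monotonic across vCPUs: if system
          * time hasn't advanced past the last value handed out, bump by one.
          */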
2315 do {
2316 old = d->arch.vtsc_last;
2317 new = now > d->arch.vtsc_last ? now : old + 1;
2318 } while ( cmpxchg(&d->arch.vtsc_last, old, new) != old );
2319
2320 return gtime_to_gtsc(d, new);
2321 }
2322
2323 bool clocksource_is_tsc(void)
2324 {
2325 return plt_src.read_counter == read_tsc;
2326 }
2327
2328 int host_tsc_is_safe(void)
2329 {
2330 return boot_cpu_has(X86_FEATURE_TSC_RELIABLE);
2331 }
2332
2333 /*
2334  * Called to collect TSC-related data only for a save file or live
2335  * migration; called after the last rdtsc is done on this incarnation.
2336  */
2337 void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
2338 uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
2339 uint32_t *incarnation)
2340 {
2341 bool enable_tsc_scaling = is_hvm_domain(d) &&
2342 hvm_tsc_scaling_supported && !d->arch.vtsc;
2343
2344 *incarnation = d->arch.incarnation;
2345 *tsc_mode = d->arch.tsc_mode;
2346
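         /*
          * Note the unusual switch layout: TSC_MODE_ALWAYS_EMULATE jumps into
          * the middle of the TSC_MODE_DEFAULT handling, so both share the
          * vtsc path.
          */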
2347 switch ( *tsc_mode )
2348 {
2349 uint64_t tsc;
2350
2351 case TSC_MODE_NEVER_EMULATE:
2352 *elapsed_nsec = *gtsc_khz = 0;
2353 break;
2354 case TSC_MODE_DEFAULT:
2355 if ( d->arch.vtsc )
2356 {
2357 case TSC_MODE_ALWAYS_EMULATE:
2358 *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
2359 *gtsc_khz = d->arch.tsc_khz;
2360 break;
2361 }
2362 tsc = rdtsc();
2363 *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
2364 *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz;
2365 break;
2366 }
2367
2368 if ( (int64_t)*elapsed_nsec < 0 )
2369 *elapsed_nsec = 0;
2370 }
2371
2372 /*
2373 * This may be called as many as three times for a domain, once when the
2374 * hypervisor creates the domain, once when the toolstack creates the
2375 * domain and, if restoring/migrating, once when saved/migrated values
2376 * are restored. Care must be taken that, if multiple calls occur,
2377 * only the last "sticks" and all are completed before the guest executes
2378  * an rdtsc instruction.
2379 */
2380 int tsc_set_info(struct domain *d,
2381 uint32_t tsc_mode, uint64_t elapsed_nsec,
2382 uint32_t gtsc_khz, uint32_t incarnation)
2383 {
2384 ASSERT(!is_system_domain(d));
2385
2386 if ( is_pv_domain(d) && is_hardware_domain(d) )
2387 {
2388 d->arch.vtsc = 0;
2389 return 0;
2390 }
2391
2392 switch ( tsc_mode )
2393 {
2394 case TSC_MODE_DEFAULT:
2395 case TSC_MODE_ALWAYS_EMULATE:
2396 d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
2397 d->arch.tsc_khz = gtsc_khz ?: cpu_khz;
2398 set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
2399
2400 /*
2401 * In default mode use native TSC if the host has safe TSC and
2402 * host and guest frequencies are the same (either "naturally" or
2403 * - for HVM/PVH - via TSC scaling).
2404 * When a guest is created, gtsc_khz is passed in as zero, making
2405 * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
2406 */
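             /*
              * NB: TSC_MODE_NEVER_EMULATE enters at the case label below and
              * thus shares the "use the native TSC" path.
              */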
2407 if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() &&
2408 (d->arch.tsc_khz == cpu_khz ||
2409 (is_hvm_domain(d) &&
2410 hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
2411 {
2412 case TSC_MODE_NEVER_EMULATE:
2413 d->arch.vtsc = 0;
2414 break;
2415 }
2416 d->arch.vtsc = 1;
2417 d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
2418 break;
2419
2420 default:
2421 return -EINVAL;
2422 }
2423
2424 d->arch.tsc_mode = tsc_mode;
2425
2426 d->arch.incarnation = incarnation + 1;
2427 if ( is_hvm_domain(d) )
2428 {
2429 if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
2430 d->arch.hvm.tsc_scaling_ratio =
2431 hvm_get_tsc_scaling_ratio(d->arch.tsc_khz);
2432
2433 hvm_set_rdtsc_exiting(d, d->arch.vtsc);
2434 if ( d->vcpu && d->vcpu[0] && incarnation == 0 )
2435 {
2436 /*
2437 * set_tsc_offset() is called from hvm_vcpu_initialise() before
2438 * tsc_set_info(). New vtsc mode may require recomputing TSC
2439 * offset.
2440 * We only need to do this for BSP during initial boot. APs will
2441 * call set_tsc_offset() later from hvm_vcpu_reset_state() and they
2442 * will sync their TSC to BSP's sync_tsc.
2443 */
2444 d->arch.hvm.sync_tsc = rdtsc();
2445 hvm_set_tsc_offset(d->vcpu[0],
2446 d->vcpu[0]->arch.hvm.cache_tsc_offset,
2447 d->arch.hvm.sync_tsc);
2448 }
2449 }
2450
2451 recalculate_cpuid_policy(d);
2452
2453 return 0;
2454 }
2455
2456 /* vtsc may incur measurable performance degradation, diagnose with this */
2457 static void dump_softtsc(unsigned char key)
2458 {
2459 struct domain *d;
2460 int domcnt = 0;
2461
2462 tsc_check_reliability();
2463 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
2464 printk("TSC marked as reliable, "
2465 "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2466     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
2467 {
2468 printk("TSC has constant rate, ");
2469 if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
2470 printk("no deep Cstates, passed warp test, deemed reliable, ");
2471 else
2472 printk("deep Cstates possible, so not reliable, ");
2473 printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2474     }
         else
2475 printk("TSC not marked as either constant or reliable, "
2476 "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
2477
2478 rcu_read_lock(&domlist_read_lock);
2479
2480 for_each_domain ( d )
2481 {
2482 if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT )
2483 continue;
2484         printk("dom%u%s: mode=%d", d->domain_id,
2485 is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
2486 if ( d->arch.vtsc_offset )
2487 printk(",ofs=%#"PRIx64, d->arch.vtsc_offset);
2488 if ( d->arch.tsc_khz )
2489 printk(",khz=%"PRIu32, d->arch.tsc_khz);
2490 if ( d->arch.incarnation )
2491 printk(",inc=%"PRIu32, d->arch.incarnation);
2492 printk("\n");
2493 domcnt++;
2494 }
2495
2496 rcu_read_unlock(&domlist_read_lock);
2497
2498 if ( !domcnt )
2499 printk("No domains have emulated TSC\n");
2500 }
2501
2502 static int __init setup_dump_softtsc(void)
2503 {
2504 register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1);
2505 return 0;
2506 }
2507 __initcall(setup_dump_softtsc);
2508
2509 /*
2510 * Local variables:
2511 * mode: C
2512 * c-file-style: "BSD"
2513 * c-basic-offset: 4
2514 * tab-width: 4
2515 * indent-tabs-mode: nil
2516 * End:
2517 */
2518