/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson : PM converted to driver model. Disable/enable API.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/param.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/time.h>
#include <xen/sched.h>
#include <xen/console.h>
#include <xen/smp.h>
#include <xen/keyhandler.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/msr.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
#include <asm/debugger.h>
#include <asm/div64.h>
#include <asm/apic.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
static unsigned int nmi_p6_event_width;
static DEFINE_PER_CPU(struct timer, nmi_timer);
static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
bool __initdata opt_watchdog;

/* watchdog_force: If true, process unknown NMIs when running the watchdog. */
bool watchdog_force;

static int __init parse_watchdog(const char *s)
{
    if ( !*s )
    {
        opt_watchdog = true;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_watchdog = false;
        return 0;
    case 1:
        opt_watchdog = true;
        return 0;
    }

    if ( !strcmp(s, "force") )
        watchdog_force = opt_watchdog = true;
    else
        return -EINVAL;

    return 0;
}
custom_param("watchdog", parse_watchdog);

/* opt_watchdog_timeout: Number of seconds to wait before panic. */
static unsigned int opt_watchdog_timeout = 5;

static int parse_watchdog_timeout(const char *s)
{
    const char *q;

    opt_watchdog_timeout = simple_strtoull(s, &q, 0);
    opt_watchdog = !!opt_watchdog_timeout;

    return *q ? -EINVAL : 0;
}
custom_param("watchdog_timeout", parse_watchdog_timeout);

/*
 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
 * - it may be reserved by some other driver, or not
 * - when not reserved by some other driver, it may be used for
 *   the NMI watchdog, or not
 *
 * This is maintained separately from nmi_active because the NMI
 * watchdog may also be driven from the I/O APIC timer.
 */
static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
static unsigned int lapic_nmi_owner;
#define LAPIC_NMI_WATCHDOG	(1<<0)
#define LAPIC_NMI_RESERVED	(1<<1)

/* nmi_active:
 * +1: the lapic NMI watchdog is active, but can be disabled
 *  0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 * -1: the lapic NMI watchdog is disabled, but can be enabled
 */
int nmi_active;

#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
#define K7_EVENT_WIDTH          32

#define P6_EVNTSEL0_ENABLE	(1 << 22)
#define P6_EVNTSEL_INT		(1 << 20)
#define P6_EVNTSEL_OS		(1 << 17)
#define P6_EVNTSEL_USR		(1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	 0x79
#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
/* Bit width of IA32_PMCx MSRs is reported using CPUID.0AH:EAX[23:16]. */
#define P6_EVENT_WIDTH_MASK	(((1 << 8) - 1) << 16)
#define P6_EVENT_WIDTH_MIN	32

#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
#define P4_CCCR_OVF_PMI0	(1<<26)
#define P4_CCCR_OVF_PMI1	(1<<27)
#define P4_CCCR_OVF		(1<<31)
#define P4_CCCR_THRESHOLD(N)	((N)<<20)
#define P4_CCCR_COMPLEMENT	(1<<19)
#define P4_CCCR_COMPARE		(1<<18)
#define P4_CCCR_REQUIRED	(3<<16)
#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
#define P4_CCCR_ENABLE		(1<<12)
/*
 * Set up IQ_PERFCTR0 to behave like a clock, by having IQ_CCCR0 filter
 * CRU_ESCR0 (with any non-null event selector) through a complemented
 * max threshold. [IA32-Vol3, Section 14.9.9]
 */
#define P4_NMI_CRU_ESCR0	P4_ESCR_EVENT_SELECT(0x3F)
#define P4_NMI_IQ_CCCR0	\
    (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
     P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)

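/*
 * Busy-wait for up to ten watchdog periods' worth of TSC ticks, leaving
 * early once this CPU has observed at least two further NMIs.
 */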
static void __init wait_for_nmis(void *p)
{
    unsigned int start_count = this_cpu(nmi_count);
    unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
    unsigned long s, e;

    s = rdtsc();
    do {
        cpu_relax();
        if ( this_cpu(nmi_count) >= start_count + 2 )
            break;
        e = rdtsc();
    } while ( e - s < ticks );
}

void __init check_nmi_watchdog(void)
{
    static unsigned int __initdata prev_nmi_count[NR_CPUS];
    int cpu;
    bool ok = true;

    if ( nmi_watchdog == NMI_NONE )
        return;

    printk("Testing NMI watchdog on all CPUs:");

    for_each_online_cpu ( cpu )
        prev_nmi_count[cpu] = per_cpu(nmi_count, cpu);

    /*
     * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
     * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
     * uses only runs while the core's not halted.
     */
    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);

    for_each_online_cpu ( cpu )
    {
        if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 )
        {
            printk(" %d", cpu);
            ok = false;
        }
    }

    printk(" %s\n", ok ? "ok" : "stuck");

    /*
     * Now that we know it works we can reduce NMI frequency to
     * something more reasonable; makes a difference in some configs.
     * There's a limit to how slow we can go because writing the perfctr
     * MSRs only sets the low 32 bits, with the top 8 bits sign-extended
     * from those, so it's not possible to set up a delay larger than
     * 2^31 cycles and smaller than (2^40 - 2^31) cycles.
     * (Intel SDM, section 18.22.2)
     */
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        nmi_hz = max(1ul, cpu_khz >> 20);

    return;
}

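/*
 * Per-CPU heartbeat: bump this CPU's tick count once a second.  The
 * watchdog NMI handler checks that this count keeps advancing.
 */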
static void nmi_timer_fn(void *unused)
{
    this_cpu(nmi_timer_ticks)++;
    set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000));
}

void disable_lapic_nmi_watchdog(void)
{
    if (nmi_active <= 0)
        return;
    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        wrmsr(MSR_K7_EVNTSEL0, 0, 0);
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            wrmsr(MSR_P6_EVNTSEL(0), 0, 0);
            break;
        case 15:
            wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
            wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
            break;
        }
        break;
    }
    nmi_active = -1;
    /* tell do_nmi() and others that we're not active any more */
    nmi_watchdog = NMI_NONE;
}

static void enable_lapic_nmi_watchdog(void)
{
    if (nmi_active < 0) {
        nmi_watchdog = NMI_LOCAL_APIC;
        setup_apic_nmi_watchdog();
    }
}

int reserve_lapic_nmi(void)
{
    unsigned int old_owner;

    spin_lock(&lapic_nmi_owner_lock);
    old_owner = lapic_nmi_owner;
    lapic_nmi_owner |= LAPIC_NMI_RESERVED;
    spin_unlock(&lapic_nmi_owner_lock);
    if (old_owner & LAPIC_NMI_RESERVED)
        return -EBUSY;
    if (old_owner & LAPIC_NMI_WATCHDOG)
        disable_lapic_nmi_watchdog();
    return 0;
}

void release_lapic_nmi(void)
{
    unsigned int new_owner;

    spin_lock(&lapic_nmi_owner_lock);
    new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
    lapic_nmi_owner = new_owner;
    spin_unlock(&lapic_nmi_owner_lock);
    if (new_owner & LAPIC_NMI_WATCHDOG)
        enable_lapic_nmi_watchdog();
}

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void clear_msr_range(unsigned int base, unsigned int n)
{
    unsigned int i;

    for (i = 0; i < n; i++)
        wrmsr(base+i, 0, 0);
}

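/*
 * Program the active perfctr with a negative count so that it overflows,
 * and hence raises an NMI, after cpu_khz * 1000 / nmi_hz unhalted cycles
 * (i.e. roughly 1/nmi_hz seconds of busy time).
 */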
static inline void write_watchdog_counter(const char *descr)
{
    u64 count = (u64)cpu_khz * 1000;

    do_div(count, nmi_hz);
    if ( descr )
        Dprintk("setting %s to -%#"PRIx64"\n", descr, count);
    wrmsrl(nmi_perfctr_msr, 0 - count);
}

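/*
 * AMD (K7-style counters): have PERFCTR0 count "cycles processor is
 * running" in both OS and user mode, raising an NMI on overflow.
 */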
static void setup_k7_watchdog(void)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_K7_PERFCTR0;

    clear_msr_range(MSR_K7_EVNTSEL0, 4);
    clear_msr_range(MSR_K7_PERFCTR0, 4);

    evntsel = K7_EVNTSEL_INT
        | K7_EVNTSEL_OS
        | K7_EVNTSEL_USR
        | K7_NMI_EVENT;

    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
    write_watchdog_counter("K7_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= K7_EVNTSEL_ENABLE;
    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

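/*
 * P6 family / Core: as for K7, but the counter width must first be
 * discovered via CPUID leaf 0xA (EAX[23:16]), falling back to the
 * 32-bit minimum when that leaf is unavailable.
 */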
static void setup_p6_watchdog(unsigned counter)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_P6_PERFCTR(0);

    if ( !nmi_p6_event_width && current_cpu_data.cpuid_level >= 0xa )
        nmi_p6_event_width = MASK_EXTR(cpuid_eax(0xa), P6_EVENT_WIDTH_MASK);
    if ( !nmi_p6_event_width )
        nmi_p6_event_width = P6_EVENT_WIDTH_MIN;

    if ( nmi_p6_event_width < P6_EVENT_WIDTH_MIN ||
         nmi_p6_event_width > BITS_PER_LONG )
        return;

    clear_msr_range(MSR_P6_EVNTSEL(0), 2);
    clear_msr_range(MSR_P6_PERFCTR(0), 2);

    evntsel = P6_EVNTSEL_INT
        | P6_EVNTSEL_OS
        | P6_EVNTSEL_USR
        | counter;

    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
    write_watchdog_counter("P6_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= P6_EVNTSEL0_ENABLE;
    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
}

static int setup_p4_watchdog(void)
{
    uint64_t misc_enable;

    rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL))
        return 0;

    nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
    nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
    if ( boot_cpu_data.x86_num_siblings == 2 )
        nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;

    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
        clear_msr_range(0x3F1, 2);
    /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
       docs don't fully define it, so leave it alone for now. */
    if (boot_cpu_data.x86_model >= 0x3) {
        /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
        clear_msr_range(0x3A0, 26);
        clear_msr_range(0x3BC, 3);
    } else {
        clear_msr_range(0x3A0, 31);
    }
    clear_msr_range(0x3C0, 6);
    clear_msr_range(0x3C8, 6);
    clear_msr_range(0x3E0, 2);
    clear_msr_range(MSR_P4_BPU_CCCR0, 18);
    clear_msr_range(MSR_P4_BPU_PERFCTR0, 18);

    wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0);
    wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE);
    write_watchdog_counter("P4_IQ_COUNTER0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
    return 1;
}

void setup_apic_nmi_watchdog(void)
{
    if ( nmi_watchdog == NMI_NONE )
        return;

    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        switch (boot_cpu_data.x86) {
        case 6:
        case 0xf ... 0x19:
            setup_k7_watchdog();
            break;
        default:
            return;
        }
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            setup_p6_watchdog((boot_cpu_data.x86_model < 14)
                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
            break;
        case 15:
            if (!setup_p4_watchdog())
                return;
            break;
        default:
            return;
        }
        break;
    default:
        return;
    }

    lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
    nmi_active = 1;
}

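/*
 * CPU hotplug notifier: start the heartbeat timer as a CPU comes up, and
 * kill it again if bring-up is cancelled or the CPU dies.
 */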
static int cpu_nmi_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        init_timer(&per_cpu(nmi_timer, cpu), nmi_timer_fn, NULL, cpu);
        set_timer(&per_cpu(nmi_timer, cpu), NOW());
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        kill_timer(&per_cpu(nmi_timer, cpu));
        break;
    default:
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nmi_nfb = {
    .notifier_call = cpu_nmi_callback
};

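/* Heartbeat value last seen, and consecutive stall count, for each CPU. */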
static DEFINE_PER_CPU(unsigned int, last_irq_sums);
static DEFINE_PER_CPU(unsigned int, alert_counter);

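/*
 * The watchdog starts out logically disabled (count == 1); it only becomes
 * active once watchdog_setup() has started the heartbeat timers and called
 * watchdog_enable().
 */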
static atomic_t watchdog_disable_count = ATOMIC_INIT(1);

void watchdog_disable(void)
{
    atomic_inc(&watchdog_disable_count);
}

void watchdog_enable(void)
{
    atomic_dec(&watchdog_disable_count);
}

bool watchdog_enabled(void)
{
    return !atomic_read(&watchdog_disable_count);
}

int __init watchdog_setup(void)
{
    unsigned int cpu;

    /*
     * Activate periodic heartbeats. We cannot do this earlier during
     * setup because the timer infrastructure is not available.
     */
    for_each_online_cpu ( cpu )
        cpu_nmi_callback(&cpu_nmi_nfb, CPU_UP_PREPARE, (void *)(long)cpu);
    register_cpu_notifier(&cpu_nmi_nfb);

    watchdog_enable();
    return 0;
}

/* Returns false if this was not a watchdog NMI, true otherwise */
bool nmi_watchdog_tick(const struct cpu_user_regs *regs)
{
    bool watchdog_tick = true;
    unsigned int sum = this_cpu(nmi_timer_ticks);

    if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() )
    {
        /*
         * Ayiee, looks like this CPU is stuck ... wait for the timeout
         * before doing the oops ...
         */
        this_cpu(alert_counter)++;
        if ( this_cpu(alert_counter) == opt_watchdog_timeout*nmi_hz )
        {
            console_force_unlock();
            printk("Watchdog timer detects that CPU%d is stuck!\n",
                   smp_processor_id());
            fatal_trap(regs, 1);
        }
    }
    else
    {
        this_cpu(last_irq_sums) = sum;
        this_cpu(alert_counter) = 0;
    }

    if ( nmi_perfctr_msr )
    {
        uint64_t msr_content;

        /* Work out if this is a watchdog tick by checking for overflow. */
        if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 )
        {
            rdmsrl(MSR_P4_IQ_CCCR0, msr_content);
            if ( !(msr_content & P4_CCCR_OVF) )
                watchdog_tick = false;

            /*
             * P4 quirks:
             * - An overflown perfctr will assert its interrupt
             *   until the OVF flag in its CCCR is cleared.
             * - LVTPC is masked on interrupt and must be
             *   unmasked by the LVTPC handler.
             */
            wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_P6_PERFCTR(0) )
        {
            rdmsrl(MSR_P6_PERFCTR(0), msr_content);
            if ( msr_content & (1ULL << (nmi_p6_event_width - 1)) )
                watchdog_tick = false;

            /*
             * Only P6-based Pentium M needs to re-unmask the APIC vector,
             * but it doesn't hurt other P6 variants.
             */
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 )
        {
            rdmsrl(MSR_K7_PERFCTR0, msr_content);
            if ( msr_content & (1ULL << K7_EVENT_WIDTH) )
                watchdog_tick = false;
        }
        write_watchdog_counter(NULL);
    }

    return watchdog_tick;
}

/*
 * For some reason the destination shorthand for self is not valid
 * when used with the NMI delivery mode. This is documented in Tables
 * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to
 * our own APIC ID explicitly, which is valid.
 */
void self_nmi(void)
{
    unsigned long flags;
    u32 id = get_apic_id();
    local_irq_save(flags);
    apic_wait_icr_idle();
    apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, id);
    local_irq_restore(flags);
}

static void do_nmi_trigger(unsigned char key)
{
    printk("Triggering NMI on APIC ID %x\n", get_apic_id());
    self_nmi();
}

static void do_nmi_stats(unsigned char key)
{
    const struct vcpu *v;
    unsigned int cpu;
    bool pend, mask;

    printk("CPU\tNMI\n");
    for_each_online_cpu ( cpu )
        printk("%3u\t%3u\n", cpu, per_cpu(nmi_count, cpu));

    if ( !hardware_domain || !(v = domain_vcpu(hardware_domain, 0)) )
        return;

    pend = v->arch.nmi_pending;
    mask = v->arch.async_exception_mask & (1 << VCPU_TRAP_NMI);
    if ( pend || mask )
        printk("%pv: NMI%s%s\n",
               v, pend ? " pending" : "", mask ? " masked" : "");
    else
        printk("%pv: NMI neither pending nor masked\n", v);
}

static __init int register_nmi_trigger(void)
{
    register_keyhandler('N', do_nmi_trigger, "trigger an NMI", 0);
    register_keyhandler('n', do_nmi_stats, "NMI statistics", 1);
    return 0;
}
__initcall(register_nmi_trigger);