/*
 * linux/arch/i386/nmi.c
 *
 * NMI watchdog support on APIC systems
 *
 * Started by Ingo Molnar <mingo@redhat.com>
 *
 * Fixes:
 *  Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson : PM converted to driver model. Disable/enable API.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/param.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/time.h>
#include <xen/sched.h>
#include <xen/console.h>
#include <xen/smp.h>
#include <xen/keyhandler.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/msr.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
#include <asm/debugger.h>
#include <asm/div64.h>
#include <asm/apic.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
static unsigned int nmi_p6_event_width;
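/*
 * Software heartbeat: a per-CPU 1 Hz timer.  nmi_watchdog_tick() compares
 * its tick count against the value seen on the previous watchdog NMI to
 * decide whether this CPU is still making progress.
 */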
static DEFINE_PER_CPU(struct timer, nmi_timer);
static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
bool __initdata opt_watchdog;

/* watchdog_force: If true, process unknown NMIs when running the watchdog. */
bool watchdog_force;

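/* Parse "watchdog=<bool>|force"; "force" implies the watchdog is on. */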
static int __init parse_watchdog(const char *s)
{
    if ( !*s )
    {
        opt_watchdog = true;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_watchdog = false;
        return 0;
    case 1:
        opt_watchdog = true;
        return 0;
    }

    if ( !strcmp(s, "force") )
        watchdog_force = opt_watchdog = true;
    else
        return -EINVAL;

    return 0;
}
custom_param("watchdog", parse_watchdog);

/* opt_watchdog_timeout: Number of seconds to wait before panic. */
static unsigned int opt_watchdog_timeout = 5;

static int parse_watchdog_timeout(const char *s)
{
    const char *q;

    opt_watchdog_timeout = simple_strtoull(s, &q, 0);
    opt_watchdog = !!opt_watchdog_timeout;

    return *q ? -EINVAL : 0;
}
custom_param("watchdog_timeout", parse_watchdog_timeout);

/*
 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
 * - it may be reserved by some other driver, or not
 * - when not reserved by some other driver, it may be used for
 *   the NMI watchdog, or not
 *
 * This is maintained separately from nmi_active because the NMI
 * watchdog may also be driven from the I/O APIC timer.
 */
static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
static unsigned int lapic_nmi_owner;
#define LAPIC_NMI_WATCHDOG (1<<0)
#define LAPIC_NMI_RESERVED (1<<1)

/* nmi_active:
 * +1: the lapic NMI watchdog is active, but can be disabled
 *  0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 * -1: the lapic NMI watchdog is disabled, but can be enabled
 */
int nmi_active;

#define K7_EVNTSEL_ENABLE  (1 << 22)
#define K7_EVNTSEL_INT     (1 << 20)
#define K7_EVNTSEL_OS      (1 << 17)
#define K7_EVNTSEL_USR     (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
#define K7_NMI_EVENT       K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
#define K7_EVENT_WIDTH     32

#define P6_EVNTSEL0_ENABLE (1 << 22)
#define P6_EVNTSEL_INT     (1 << 20)
#define P6_EVNTSEL_OS      (1 << 17)
#define P6_EVNTSEL_USR     (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED   0x79
#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
/* Bit width of IA32_PMCx MSRs is reported using CPUID.0AH:EAX[23:16]. */
#define P6_EVENT_WIDTH_MASK (((1 << 8) - 1) << 16)
#define P6_EVENT_WIDTH_MIN  32

#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_CCCR_OVF_PMI0     (1<<26)
#define P4_CCCR_OVF_PMI1     (1<<27)
#define P4_CCCR_OVF          (1<<31)
#define P4_CCCR_THRESHOLD(N) ((N)<<20)
#define P4_CCCR_COMPLEMENT   (1<<19)
#define P4_CCCR_COMPARE      (1<<18)
#define P4_CCCR_REQUIRED     (3<<16)
#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
#define P4_CCCR_ENABLE       (1<<12)
/*
 * Set up IQ_PERFCTR0 to behave like a clock, by having IQ_CCCR0 filter
 * CRU_ESCR0 (with any non-null event selector) through a complemented
 * max threshold. [IA32-Vol3, Section 14.9.9]
 */
#define P4_NMI_CRU_ESCR0 P4_ESCR_EVENT_SELECT(0x3F)
#define P4_NMI_IQ_CCCR0 \
    (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
     P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)

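/*
 * Spin until this CPU has observed at least two watchdog NMIs, bounded
 * by roughly ten NMI periods' worth of TSC ticks.
 */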
static void __init wait_for_nmis(void *p)
{
    unsigned int start_count = this_cpu(nmi_count);
    unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
    unsigned long s, e;

    s = rdtsc();
    do {
        cpu_relax();
        if ( this_cpu(nmi_count) >= start_count + 2 )
            break;
        e = rdtsc();
    } while ( e - s < ticks );
}

void __init check_nmi_watchdog(void)
{
    static unsigned int __initdata prev_nmi_count[NR_CPUS];
    int cpu;
    bool ok = true;

    if ( nmi_watchdog == NMI_NONE )
        return;

    printk("Testing NMI watchdog on all CPUs:");

    for_each_online_cpu ( cpu )
        prev_nmi_count[cpu] = per_cpu(nmi_count, cpu);

    /*
     * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
     * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
     * uses only runs while the core is not halted.
     */
    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);

    for_each_online_cpu ( cpu )
    {
        if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 )
        {
            printk(" %d", cpu);
            ok = false;
        }
    }

    printk(" %s\n", ok ? "ok" : "stuck");

    /*
     * Now that we know it works we can reduce NMI frequency to
     * something more reasonable; makes a difference in some configs.
     * There's a limit to how slow we can go because writing the perfctr
     * MSRs only sets the low 32 bits, with the top 8 bits sign-extended
     * from those, so it's not possible to set up a delay larger than
     * 2^31 cycles and smaller than (2^40 - 2^31) cycles.
     * (Intel SDM, section 18.22.2)
     */
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        nmi_hz = max(1ul, cpu_khz >> 20);
}

static void nmi_timer_fn(void *unused)
{
    this_cpu(nmi_timer_ticks)++;
    set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000));
}

void disable_lapic_nmi_watchdog(void)
{
    if ( nmi_active <= 0 )
        return;
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_AMD:
        wrmsr(MSR_K7_EVNTSEL0, 0, 0);
        break;
    case X86_VENDOR_INTEL:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
            wrmsr(MSR_P6_EVNTSEL(0), 0, 0);
            break;
        case 15:
            wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
            wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
            break;
        }
        break;
    }
    nmi_active = -1;
    /* tell do_nmi() and others that we're not active any more */
    nmi_watchdog = NMI_NONE;
}

static void enable_lapic_nmi_watchdog(void)
{
    if ( nmi_active < 0 )
    {
        nmi_watchdog = NMI_LOCAL_APIC;
        setup_apic_nmi_watchdog();
    }
}

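/*
 * Allow another subsystem (e.g. a profiler) to take exclusive ownership
 * of the LAPIC NMI hardware: reserving it disables the watchdog, and
 * releasing it re-enables the watchdog if it was the prior owner.
 */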
int reserve_lapic_nmi(void)
{
    unsigned int old_owner;

    spin_lock(&lapic_nmi_owner_lock);
    old_owner = lapic_nmi_owner;
    lapic_nmi_owner |= LAPIC_NMI_RESERVED;
    spin_unlock(&lapic_nmi_owner_lock);
    if ( old_owner & LAPIC_NMI_RESERVED )
        return -EBUSY;
    if ( old_owner & LAPIC_NMI_WATCHDOG )
        disable_lapic_nmi_watchdog();
    return 0;
}

void release_lapic_nmi(void)
{
    unsigned int new_owner;

    spin_lock(&lapic_nmi_owner_lock);
    new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
    lapic_nmi_owner = new_owner;
    spin_unlock(&lapic_nmi_owner_lock);
    if ( new_owner & LAPIC_NMI_WATCHDOG )
        enable_lapic_nmi_watchdog();
}

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void clear_msr_range(unsigned int base, unsigned int n)
{
    unsigned int i;

    for ( i = 0; i < n; i++ )
        wrmsr(base + i, 0, 0);
}

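/*
 * Program the active perfctr with the negated cycle count of one watchdog
 * period, so that it overflows (raising a PMI, which LVTPC delivers as an
 * NMI) after roughly 1/nmi_hz seconds.
 */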
static inline void write_watchdog_counter(const char *descr)
{
    u64 count = (u64)cpu_khz * 1000;

    do_div(count, nmi_hz);
    if ( descr )
        Dprintk("setting %s to -%#"PRIx64"\n", descr, count);
    wrmsrl(nmi_perfctr_msr, 0 - count);
}

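/*
 * AMD K7 and later: count event 0x76 ("cycles when the processor is
 * running") in PERFCTR0, with the INT bit set so that counter overflow
 * is delivered through LVTPC as an NMI.
 */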
static void setup_k7_watchdog(void)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_K7_PERFCTR0;

    clear_msr_range(MSR_K7_EVNTSEL0, 4);
    clear_msr_range(MSR_K7_PERFCTR0, 4);

    evntsel = K7_EVNTSEL_INT
            | K7_EVNTSEL_OS
            | K7_EVNTSEL_USR
            | K7_NMI_EVENT;

    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
    write_watchdog_counter("K7_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= K7_EVNTSEL_ENABLE;
    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

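/*
 * Intel P6 family: as for K7, but the counter width varies by model, so
 * probe it from CPUID leaf 0xa (falling back to the 32-bit minimum)
 * before relying on the sign bit as the overflow indicator.
 */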
static void setup_p6_watchdog(unsigned int counter)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_P6_PERFCTR(0);

    if ( !nmi_p6_event_width && current_cpu_data.cpuid_level >= 0xa )
        nmi_p6_event_width = MASK_EXTR(cpuid_eax(0xa), P6_EVENT_WIDTH_MASK);
    if ( !nmi_p6_event_width )
        nmi_p6_event_width = P6_EVENT_WIDTH_MIN;

    if ( nmi_p6_event_width < P6_EVENT_WIDTH_MIN ||
         nmi_p6_event_width > BITS_PER_LONG )
        return;

    clear_msr_range(MSR_P6_EVNTSEL(0), 2);
    clear_msr_range(MSR_P6_PERFCTR(0), 2);

    evntsel = P6_EVNTSEL_INT
            | P6_EVNTSEL_OS
            | P6_EVNTSEL_USR
            | counter;

    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
    write_watchdog_counter("P6_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= P6_EVNTSEL0_ENABLE;
    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
}

static int setup_p4_watchdog(void)
{
    uint64_t misc_enable;

    rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    if ( !(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL) )
        return 0;

    nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
    nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
    if ( boot_cpu_data.x86_num_siblings == 2 )
        nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;

    if ( !(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL) )
        clear_msr_range(0x3F1, 2);
    /*
     * MSR 0x3F0 seems to have a default value of 0xFC00, but current
     * docs don't fully define it, so leave it alone for now.
     */
    if ( boot_cpu_data.x86_model >= 0x3 )
    {
        /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
        clear_msr_range(0x3A0, 26);
        clear_msr_range(0x3BC, 3);
    }
    else
        clear_msr_range(0x3A0, 31);
    clear_msr_range(0x3C0, 6);
    clear_msr_range(0x3C8, 6);
    clear_msr_range(0x3E0, 2);
    clear_msr_range(MSR_P4_BPU_CCCR0, 18);
    clear_msr_range(MSR_P4_BPU_PERFCTR0, 18);

    wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0);
    wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE);
    write_watchdog_counter("P4_IQ_COUNTER0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
    return 1;
}

void setup_apic_nmi_watchdog(void)
{
    if ( nmi_watchdog == NMI_NONE )
        return;

    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_AMD:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
        case 0xf ... 0x19:
            setup_k7_watchdog();
            break;
        default:
            return;
        }
        break;
    case X86_VENDOR_INTEL:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
            setup_p6_watchdog((boot_cpu_data.x86_model < 14)
                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
            break;
        case 15:
            if ( !setup_p4_watchdog() )
                return;
            break;
        default:
            return;
        }
        break;
    default:
        return;
    }

    lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
    nmi_active = 1;
}

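/*
 * CPU hotplug notifier: start the heartbeat timer when a CPU comes up,
 * and kill it if bringup is cancelled or the CPU dies.
 */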
static int cpu_nmi_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        init_timer(&per_cpu(nmi_timer, cpu), nmi_timer_fn, NULL, cpu);
        set_timer(&per_cpu(nmi_timer, cpu), NOW());
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        kill_timer(&per_cpu(nmi_timer, cpu));
        break;
    default:
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nmi_nfb = {
    .notifier_call = cpu_nmi_callback
};

static DEFINE_PER_CPU(unsigned int, last_irq_sums);
static DEFINE_PER_CPU(unsigned int, alert_counter);

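/*
 * Initialised to 1 so that the watchdog counts as disabled until
 * watchdog_setup() performs the matching watchdog_enable().
 */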
static atomic_t watchdog_disable_count = ATOMIC_INIT(1);

void watchdog_disable(void)
{
    atomic_inc(&watchdog_disable_count);
}

void watchdog_enable(void)
{
    atomic_dec(&watchdog_disable_count);
}

bool watchdog_enabled(void)
{
    return !atomic_read(&watchdog_disable_count);
}

int __init watchdog_setup(void)
{
    unsigned int cpu;

    /*
     * Activate periodic heartbeats. We cannot do this earlier during
     * setup because the timer infrastructure is not available.
     */
    for_each_online_cpu ( cpu )
        cpu_nmi_callback(&cpu_nmi_nfb, CPU_UP_PREPARE, (void *)(long)cpu);
    register_cpu_notifier(&cpu_nmi_nfb);

    watchdog_enable();
    return 0;
}

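/*
 * Called for every watchdog NMI: compare the heartbeat against the value
 * seen on the previous tick to detect a wedged CPU, then reload the
 * perfctr so the next NMI fires in about 1/nmi_hz seconds.
 */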
/* Returns false if this was not a watchdog NMI, true otherwise. */
bool nmi_watchdog_tick(const struct cpu_user_regs *regs)
{
    bool watchdog_tick = true;
    unsigned int sum = this_cpu(nmi_timer_ticks);

    if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() )
    {
        /*
         * Ayiee, looks like this CPU is stuck ... wait for the timeout
         * before doing the oops ...
         */
        this_cpu(alert_counter)++;
        if ( this_cpu(alert_counter) == opt_watchdog_timeout * nmi_hz )
        {
            console_force_unlock();
            printk("Watchdog timer detects that CPU%d is stuck!\n",
                   smp_processor_id());
            fatal_trap(regs, 1);
        }
    }
    else
    {
        this_cpu(last_irq_sums) = sum;
        this_cpu(alert_counter) = 0;
    }

    if ( nmi_perfctr_msr )
    {
        uint64_t msr_content;

        /* Work out if this is a watchdog tick by checking for overflow. */
        if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 )
        {
            rdmsrl(MSR_P4_IQ_CCCR0, msr_content);
            if ( !(msr_content & P4_CCCR_OVF) )
                watchdog_tick = false;

            /*
             * P4 quirks:
             * - An overflown perfctr will assert its interrupt
             *   until the OVF flag in its CCCR is cleared.
             * - LVTPC is masked on interrupt and must be
             *   unmasked by the LVTPC handler.
             */
            wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_P6_PERFCTR(0) )
        {
            rdmsrl(MSR_P6_PERFCTR(0), msr_content);
            if ( msr_content & (1ULL << (nmi_p6_event_width - 1)) )
                watchdog_tick = false;

            /*
             * Only the P6-based Pentium M needs to re-unmask the APIC
             * vector, but it doesn't hurt other P6 variants.
             */
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 )
        {
            rdmsrl(MSR_K7_PERFCTR0, msr_content);
            if ( msr_content & (1ULL << K7_EVENT_WIDTH) )
                watchdog_tick = false;
        }
        write_watchdog_counter(NULL);
    }

    return watchdog_tick;
}

/*
 * For some reason the destination shorthand for self is not valid
 * when used with the NMI delivery mode. This is documented in Tables
 * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to
 * our own APIC ID explicitly which is valid.
 */
void self_nmi(void)
{
    unsigned long flags;
    u32 id = get_apic_id();

    local_irq_save(flags);
    apic_wait_icr_idle();
    apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, id);
    local_irq_restore(flags);
}

static void do_nmi_trigger(unsigned char key)
{
    printk("Triggering NMI on APIC ID %x\n", get_apic_id());
    self_nmi();
}

static void do_nmi_stats(unsigned char key)
{
    const struct vcpu *v;
    unsigned int cpu;
    bool pend, mask;

    printk("CPU\tNMI\n");
    for_each_online_cpu ( cpu )
        printk("%3u\t%3u\n", cpu, per_cpu(nmi_count, cpu));

    if ( !hardware_domain || !(v = domain_vcpu(hardware_domain, 0)) )
        return;

    pend = v->arch.nmi_pending;
    mask = v->arch.async_exception_mask & (1 << VCPU_TRAP_NMI);
    if ( pend || mask )
        printk("%pv: NMI%s%s\n",
               v, pend ? " pending" : "", mask ? " masked" : "");
    else
        printk("%pv: NMI neither pending nor masked\n", v);
}

static __init int register_nmi_trigger(void)
{
    register_keyhandler('N', do_nmi_trigger, "trigger an NMI", 0);
    register_keyhandler('n', do_nmi_stats, "NMI statistics", 1);
    return 0;
}
__initcall(register_nmi_trigger);