#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/string.h>
#include <xen/bitops.h>
#include <xen/smp.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/i387.h>
#include <mach_apic.h>
#include <asm/hvm/support.h>

#include "cpu.h"

/*
 * Processors which have self-snooping capability can handle conflicting
 * memory types across CPUs by snooping their own cache.  However, there
 * exist CPU models in which having conflicting memory types still leads to
 * unpredictable behavior, machine check errors, or hangs.  Clear this
 * feature to prevent its use on machines with known errata.
 */
static void __init check_memory_type_self_snoop_errata(void)
{
	if (!boot_cpu_has(X86_FEATURE_SS))
		return;

	switch (boot_cpu_data.x86_model) {
	case 0x0f: /* Merom */
	case 0x16: /* Merom L */
	case 0x17: /* Penryn */
	case 0x1d: /* Dunnington */
	case 0x1e: /* Nehalem */
	case 0x1f: /* Auburndale / Havendale */
	case 0x1a: /* Nehalem EP */
	case 0x2e: /* Nehalem EX */
	case 0x25: /* Westmere */
	case 0x2c: /* Westmere EP */
	case 0x2a: /* SandyBridge */
		return;
	}

	setup_force_cpu_cap(X86_FEATURE_XEN_SELFSNOOP);
}

/*
 * Set caps in expected_levelling_cap, probe a specific masking MSR, and set
 * caps in levelling_caps if it is found, or clobber the MSR index if missing.
 * If present, the MSR's default (current) value is returned.
 */
static uint64_t __init _probe_mask_msr(unsigned int *msr, uint64_t caps)
{
	uint64_t val = 0;

	expected_levelling_cap |= caps;

	if (rdmsr_safe(*msr, val) || wrmsr_safe(*msr, val))
		*msr = 0;
	else
		levelling_caps |= caps;

	return val;
}

/* Indices of the masking MSRs, or 0 if unavailable. */
static unsigned int __read_mostly msr_basic, __read_mostly msr_ext,
	__read_mostly msr_xsave;

/*
 * Probe for the existence of the expected masking MSRs.  They might easily
 * not be available if Xen is running virtualised.
 */
static void __init probe_masking_msrs(void)
{
	const struct cpuinfo_x86 *c = &boot_cpu_data;
	unsigned int exp_msr_basic, exp_msr_ext, exp_msr_xsave;

	/* Only family 6 supports this feature. */
	if (c->x86 != 6)
		return;

	switch (c->x86_model) {
	case 0x17: /* Yorkfield, Wolfdale, Penryn, Harpertown(DP) */
	case 0x1d: /* Dunnington(MP) */
		msr_basic = MSR_INTEL_MASK_V1_CPUID1;
		break;

	case 0x1a: /* Bloomfield, Nehalem-EP(Gainestown) */
	case 0x1e: /* Clarksfield, Lynnfield, Jasper Forest */
	case 0x1f: /* Something Nehalem-based - perhaps Auburndale/Havendale? */
	case 0x25: /* Arrandale, Clarksdale */
	case 0x2c: /* Gulftown, Westmere-EP */
	case 0x2e: /* Nehalem-EX(Beckton) */
	case 0x2f: /* Westmere-EX */
		msr_basic = MSR_INTEL_MASK_V2_CPUID1;
		msr_ext   = MSR_INTEL_MASK_V2_CPUID80000001;
		break;

	case 0x2a: /* SandyBridge */
	case 0x2d: /* SandyBridge-E, SandyBridge-EN, SandyBridge-EP */
		msr_basic = MSR_INTEL_MASK_V3_CPUID1;
		msr_ext   = MSR_INTEL_MASK_V3_CPUID80000001;
		msr_xsave = MSR_INTEL_MASK_V3_CPUIDD_01;
		break;
	}

	exp_msr_basic = msr_basic;
	exp_msr_ext   = msr_ext;
	exp_msr_xsave = msr_xsave;

	if (msr_basic)
		cpuidmask_defaults._1cd = _probe_mask_msr(&msr_basic, LCAP_1cd);

	if (msr_ext)
		cpuidmask_defaults.e1cd = _probe_mask_msr(&msr_ext, LCAP_e1cd);

	if (msr_xsave)
		cpuidmask_defaults.Da1 = _probe_mask_msr(&msr_xsave, LCAP_Da1);

	/*
	 * Don't bother warning about a mismatch if virtualised.  These MSRs
	 * are not architectural and almost never virtualised.
	 */
	if ((expected_levelling_cap == levelling_caps) ||
	    cpu_has_hypervisor)
		return;

	printk(XENLOG_WARNING "Mismatch between expected (%#x) "
	       "and real (%#x) levelling caps: missing %#x\n",
	       expected_levelling_cap, levelling_caps,
	       (expected_levelling_cap ^ levelling_caps) & expected_levelling_cap);
	printk(XENLOG_WARNING "Fam %#x, model %#x expected (%#x/%#x/%#x), "
	       "got (%#x/%#x/%#x)\n", c->x86, c->x86_model,
	       exp_msr_basic, exp_msr_ext, exp_msr_xsave,
	       msr_basic, msr_ext, msr_xsave);
	printk(XENLOG_WARNING
	       "If not running virtualised, please report a bug\n");
}

/*
 * Context switch CPUID masking state to the next domain.  Only called if
 * CPUID Faulting isn't available, but masking MSRs have been detected.  A
 * parameter of NULL is used to context switch to the default host state (by
 * the CPU bringup code, crash path, etc).
 */
static void intel_ctxt_switch_masking(const struct vcpu *next)
{
	struct cpuidmasks *these_masks = &this_cpu(cpuidmasks);
	const struct domain *nextd = next ? next->domain : NULL;
	const struct cpuidmasks *masks =
		(nextd && is_pv_domain(nextd) && nextd->arch.pv.cpuidmasks)
		? nextd->arch.pv.cpuidmasks : &cpuidmask_defaults;

	if (msr_basic) {
		uint64_t val = masks->_1cd;

		/*
		 * OSXSAVE defaults to 1, which causes fast-forwarding of
		 * Xen's real setting.  Clobber it if disabled by the guest
		 * kernel.
		 */
		if (next && is_pv_vcpu(next) && !is_idle_vcpu(next) &&
		    !(next->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE))
			val &= ~(uint64_t)cpufeat_mask(X86_FEATURE_OSXSAVE);

		if (unlikely(these_masks->_1cd != val)) {
			wrmsrl(msr_basic, val);
			these_masks->_1cd = val;
		}
	}

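/*
 * Lazily context switch the remaining masking MSRs: only issue the
 * (expensive) WRMSR when the MSR was detected and the cached per-CPU
 * value differs from the target mask.
 */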
#define LAZY(msr, field)						\
	({								\
		if (unlikely(these_masks->field != masks->field) &&	\
		    (msr))						\
		{							\
			wrmsrl((msr), masks->field);			\
			these_masks->field = masks->field;		\
		}							\
	})

	LAZY(msr_ext,   e1cd);
	LAZY(msr_xsave, Da1);

#undef LAZY
}

/*
 * opt_cpuid_mask_ecx/edx: cpuid.1[ecx, edx] feature mask.
 * For example, an E8400 (Intel Core 2 Duo processor series) normally reports
 * ecx = 0x0008E3FD and edx = 0xBFEBFBFF for CPUID.EAX = 1.  To 'rev down'
 * newer hardware to an E8400, set these values via the corresponding Xen
 * boot parameters.
 */
static void __init noinline intel_init_levelling(void)
{
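	/* Prefer CPUID faulting; if it is available, the masking MSRs are not used. */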
	if (probe_cpuid_faulting())
		return;

	probe_masking_msrs();

	if (msr_basic) {
		uint32_t ecx, edx, tmp;

		cpuid(0x00000001, &tmp, &tmp, &ecx, &edx);

		ecx &= opt_cpuid_mask_ecx;
		edx &= opt_cpuid_mask_edx;

		/* Fast-forward bits - Must be set. */
		if (ecx & cpufeat_mask(X86_FEATURE_XSAVE))
			ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
		edx |= cpufeat_mask(X86_FEATURE_APIC);

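		/*
		 * cpuidmask_defaults._1cd holds the CPUID.1 %edx mask in its
		 * upper and the %ecx mask in its lower 32 bits.
		 */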
		cpuidmask_defaults._1cd &= ((u64)edx << 32) | ecx;
	}

	if (msr_ext) {
		uint32_t ecx, edx, tmp;

		cpuid(0x80000001, &tmp, &tmp, &ecx, &edx);

		ecx &= opt_cpuid_mask_ext_ecx;
		edx &= opt_cpuid_mask_ext_edx;

		cpuidmask_defaults.e1cd &= ((u64)edx << 32) | ecx;
	}

	if (msr_xsave) {
		uint32_t eax, tmp;

		cpuid_count(0x0000000d, 1, &eax, &tmp, &tmp, &tmp);

		eax &= opt_cpuid_mask_xsave_eax;

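		/*
		 * Only the lower (%eax) half of the mask is controlled by the
		 * command line; leave the upper half of the default fully set.
		 */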
		cpuidmask_defaults.Da1 &= (~0ULL << 32) | eax;
	}

	if (opt_cpu_info) {
		printk(XENLOG_INFO "Levelling caps: %#x\n", levelling_caps);

		if (!cpu_has_cpuid_faulting)
			printk(XENLOG_INFO
			       "MSR defaults: 1d 0x%08x, 1c 0x%08x, e1d 0x%08x, "
			       "e1c 0x%08x, Da1 0x%08x\n",
			       (uint32_t)(cpuidmask_defaults._1cd >> 32),
			       (uint32_t)cpuidmask_defaults._1cd,
			       (uint32_t)(cpuidmask_defaults.e1cd >> 32),
			       (uint32_t)cpuidmask_defaults.e1cd,
			       (uint32_t)cpuidmask_defaults.Da1);
	}

	if (levelling_caps)
		ctxt_switch_masking = intel_ctxt_switch_masking;
}

static void early_init_intel(struct cpuinfo_x86 *c)
{
	u64 misc_enable, disable;

	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
	if (c->x86 == 15 && c->x86_cache_alignment == 64)
		c->x86_cache_alignment = 128;

	/* Unmask CPUID levels and NX if masked: */
	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);

	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
	if (disable) {
		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
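		/*
		 * Record the bits we cleared (and the need for EFER.NX) so
		 * that the boot trampoline can repeat the adjustment for APs
		 * and on S3 resume.
		 */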
		bootsym(trampoline_misc_enable_off) |= disable;
		bootsym(trampoline_efer) |= EFER_NX;
	}

	if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID)
		printk(KERN_INFO "revised cpuid level: %d\n",
		       cpuid_eax(0));
	if (disable & MSR_IA32_MISC_ENABLE_XD_DISABLE) {
		write_efer(read_efer() | EFER_NX);
		printk(KERN_INFO
		       "re-enabled NX (Execute Disable) protection\n");
	}

	/* CPUID workaround for Intel 0F33/0F34 CPU */
	if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 &&
	    (boot_cpu_data.x86_mask == 3 || boot_cpu_data.x86_mask == 4))
		paddr_bits = 36;

	if (c == &boot_cpu_data) {
		check_memory_type_self_snoop_errata();

		intel_init_levelling();
	}

	ctxt_switch_levelling(NULL);
}

/*
 * Errata BA80, AAK120, AAM108, AAO67, BD59, AAY54: Rapid Core C3/C6 Transition
 * May Cause Unpredictable System Behavior
 *
 * Under a complex set of internal conditions, cores rapidly performing C3/C6
 * transitions in a system with Intel Hyper-Threading Technology enabled may
 * cause a machine check error (IA32_MCi_STATUS.MCACOD = 0x0106), system hang
 * or unpredictable system behavior.
 */
static void probe_c3_errata(const struct cpuinfo_x86 *c)
{
#define INTEL_FAM6_MODEL(m) { X86_VENDOR_INTEL, 6, m, X86_FEATURE_ALWAYS }
    static const struct x86_cpu_id models[] = {
        /* Nehalem */
        INTEL_FAM6_MODEL(0x1a),
        INTEL_FAM6_MODEL(0x1e),
        INTEL_FAM6_MODEL(0x1f),
        INTEL_FAM6_MODEL(0x2e),
        /* Westmere (note Westmere-EX is not affected) */
        INTEL_FAM6_MODEL(0x2c),
        INTEL_FAM6_MODEL(0x25),
        { }
    };
#undef INTEL_FAM6_MODEL

    /* Serialized by the AP bringup code. */
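    /*
     * The apicid check fires only for a secondary SMT sibling (non-zero
     * thread index), i.e. only when Hyper-Threading is active, matching
     * the conditions of the erratum.
     */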
    if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
         x86_match_cpu(models) )
    {
        printk(XENLOG_WARNING
               "Disabling C-states C3 and C6 due to CPU errata\n");
        max_cstate = 1;
    }
}

/*
 * P4 Xeon erratum 037 workaround.
 * Hardware prefetcher may cause stale data to be loaded into the cache.
 *
 * Xeon 7400 erratum AAI65 (and later Xeons):
 * MONITOR/MWAIT may have excessive false wakeups.
 */
static void Intel_errata_workarounds(struct cpuinfo_x86 *c)
{
	unsigned long lo, hi;

	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
		rdmsr (MSR_IA32_MISC_ENABLE, lo, hi);
		if ((lo & (1<<9)) == 0) {
			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
			lo |= (1<<9);	/* Disable hw prefetching */
			wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
		}
	}

	if (c->x86 == 6 && cpu_has_clflush &&
	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
		__set_bit(X86_FEATURE_CLFLUSH_MONITOR, c->x86_capability);

	if (cpu_has_tsx_force_abort && opt_rtm_abort)
		wrmsrl(MSR_TSX_FORCE_ABORT, TSX_FORCE_ABORT_RTM);

	probe_c3_errata(c);
}


/*
 * find out the number of processor cores on the die
 */
static int num_cpu_cores(struct cpuinfo_x86 *c)
{
	unsigned int eax, ebx, ecx, edx;

	if (c->cpuid_level < 4)
		return 1;

	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
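	/*
	 * EAX[4:0] is the cache type (0 means the leaf is invalid);
	 * EAX[31:26] is the maximum number of addressable core IDs in the
	 * physical package, minus 1.
	 */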
	if (eax & 0x1f)
		return ((eax >> 26) + 1);
	else
		return 1;
}

static void intel_log_freq(const struct cpuinfo_x86 *c)
{
    unsigned int eax, ebx, ecx, edx;
    uint64_t msrval;
    uint8_t max_ratio;

    if ( c->cpuid_level >= 0x15 )
    {
        cpuid(0x15, &eax, &ebx, &ecx, &edx);
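        /*
         * CPUID.0x15: EBX/EAX give the TSC to core crystal clock ratio,
         * ECX the crystal frequency in Hz (any of them may be zero).
         */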
        if ( ecx && ebx && eax )
        {
            unsigned long long val = ecx;

            val *= ebx;
            do_div(val, eax);
            printk("CPU%u: TSC: %u Hz * %u / %u = %Lu Hz\n",
                   smp_processor_id(), ecx, ebx, eax, val);
        }
        else if ( ecx | eax | ebx )
        {
            printk("CPU%u: TSC:", smp_processor_id());
            if ( ecx )
                printk(" core: %u Hz", ecx);
            if ( ebx && eax )
                printk(" ratio: %u / %u", ebx, eax);
            printk("\n");
        }
    }

    if ( c->cpuid_level >= 0x16 )
    {
        cpuid(0x16, &eax, &ebx, &ecx, &edx);
        if ( ecx | eax | ebx )
        {
            printk("CPU%u:", smp_processor_id());
            if ( ecx )
                printk(" bus: %u MHz", ecx);
            if ( eax )
                printk(" base: %u MHz", eax);
            if ( ebx )
                printk(" max: %u MHz", ebx);
            printk("\n");
        }
    }

    if ( rdmsr_safe(MSR_INTEL_PLATFORM_INFO, msrval) )
        return;
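    /*
     * MSR_PLATFORM_INFO: bits 15:8 hold the maximum non-turbo ratio,
     * bits 47:40 the maximum-efficiency (minimum) ratio.
     */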
    max_ratio = msrval >> 8;

    if ( max_ratio )
    {
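        /*
         * The ratios scale the base (bus) clock: 100MHz on most parts,
         * 133.33MHz on Nehalem/Westmere.  factor holds that clock in 10kHz
         * units so the "(factor * ratio + 50) / 100" below rounds to MHz.
         */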
        unsigned int factor = 10000;
        uint8_t min_ratio = msrval >> 40;

        if ( c->x86 == 6 )
            switch ( c->x86_model )
            {
            case 0x1a: case 0x1e: case 0x1f: case 0x2e: /* Nehalem */
            case 0x25: case 0x2c: case 0x2f: /* Westmere */
                factor = 13333;
                break;
            }

        printk("CPU%u: ", smp_processor_id());
        if ( min_ratio )
            printk("%u ... ", (factor * min_ratio + 50) / 100);
        printk("%u MHz\n", (factor * max_ratio + 50) / 100);
    }
}

static void init_intel(struct cpuinfo_x86 *c)
{
	/* Detect the extended topology information if available */
	detect_extended_topology(c);

	init_intel_cacheinfo(c);
	if (c->cpuid_level > 9) {
		unsigned eax = cpuid_eax(10);
		/*
		 * Check for version and the number of counters:
		 * EAX[7:0] is the architectural perfmon version, EAX[15:8]
		 * the number of general-purpose counters per logical CPU.
		 */
		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
			__set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
	}

	if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) )
	{
		c->x86_max_cores = num_cpu_cores(c);
		detect_ht(c);
	}

	/* Work around errata */
	Intel_errata_workarounds(c);

	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
		(c->x86 == 0x6 && c->x86_model >= 0x0e))
		__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
	if (cpu_has(c, X86_FEATURE_ITSC)) {
		__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
		__set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
		__set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
	}
	if ( opt_arat &&
	     ( c->cpuid_level >= 0x00000006 ) &&
	     ( cpuid_eax(0x00000006) & (1u<<2) ) )
		__set_bit(X86_FEATURE_ARAT, c->x86_capability);

	if ((opt_cpu_info && !(c->apicid & (c->x86_num_siblings - 1))) ||
	    c == &boot_cpu_data )
		intel_log_freq(c);
}

const struct cpu_dev intel_cpu_dev = {
	.c_early_init	= early_init_intel,
	.c_init		= init_intel,
};