/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 *
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/init.h>
#include <xen/mm.h>
#include <xen/inttypes.h>
#include <xen/nodemask.h>
#include <xen/acpi.h>
#include <xen/numa.h>
#include <xen/pfn.h>
#include <asm/e820.h>
#include <asm/page.h>
#include <asm/spec_ctrl.h>

static struct acpi_table_slit *__read_mostly acpi_slit;

static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;

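/*
 * Mapping between proximity domains (PXMs) and Xen node IDs.  The table
 * is kept indexed by PXM whenever possible (see setup_node()).
 */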
struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };

static unsigned node_to_pxm(nodeid_t n);

static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);

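/* Does slot idx of pxm2node hold a valid mapping for pxm? */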
static inline bool node_found(unsigned idx, unsigned pxm)
{
	return ((pxm2node[idx].pxm == pxm) &&
		(pxm2node[idx].node != NUMA_NO_NODE));
}

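/*
 * Translate a PXM to its node ID.  The common case of pxm2node being
 * indexed by PXM is tried first; otherwise fall back to a linear scan.
 */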
nodeid_t pxm_to_node(unsigned pxm)
{
	unsigned i;

	if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
		return pxm2node[pxm].node;

	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (node_found(i, pxm))
			return pxm2node[i].node;

	return NUMA_NO_NODE;
}

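/*
 * Look up or allocate a node ID for a PXM.  Slots are kept indexed by
 * PXM while that index is still free; otherwise the first free slot is
 * used.  Returns NUMA_NO_NODE once the table or MAX_NUMNODES is
 * exhausted.
 */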
nodeid_t setup_node(unsigned pxm)
{
	nodeid_t node;
	unsigned idx;
	static bool warned;
	static unsigned nodes_found;

	BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);

	if (pxm < ARRAY_SIZE(pxm2node)) {
		if (node_found(pxm, pxm))
			return pxm2node[pxm].node;

		/* Try to maintain indexing of pxm2node by pxm */
		if (pxm2node[pxm].node == NUMA_NO_NODE) {
			idx = pxm;
			goto finish;
		}
	}

	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
		if (pxm2node[idx].node == NUMA_NO_NODE)
			goto finish;

	if (!warned) {
		printk(KERN_WARNING "SRAT: Too many proximity domains (%#x)\n",
		       pxm);
		warned = true;
	}

	return NUMA_NO_NODE;

 finish:
	node = nodes_found++;
	if (node >= MAX_NUMNODES)
		return NUMA_NO_NODE;
	pxm2node[idx].pxm = pxm;
	pxm2node[idx].node = node;

	return node;
}

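/*
 * Return 1 if [start, end) lies entirely within a single recorded
 * memory block belonging to the given node, 0 otherwise.
 */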
int valid_numa_range(u64 start, u64 end, nodeid_t node)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];

		if (nd->start <= start && nd->end >= end &&
			memblk_nodeid[i] == node)
			return 1;
	}

	return 0;
}

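/* Return the index of a recorded memory block overlapping [start, end), or -1. */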
static __init int conflicting_memblks(u64 start, u64 end)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

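/*
 * Clamp node i's range to [start, end), collapsing it to an empty
 * range if it lies entirely outside.
 */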
static __init void cutoff_node(int i, u64 start, u64 end)
{
	struct node *nd = &nodes[i];
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

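/* Discard all SRAT-derived state after an inconsistency has been found. */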
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		pxm2node[i].node = NUMA_NO_NODE;
	mem_hotplug = 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	mfn_t mfn;

	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
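	/*
	 * Keep a private copy of the table: the boot-time ACPI mapping is
	 * gone by the time __node_distance() needs the data.
	 */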
	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
	acpi_slit = mfn_to_virt(mfn_x(mfn));
	memcpy(acpi_slit, slit, slit->header.length);
}

/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	if (pa->apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
		return;
	}

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}

	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
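	/* SRAT rev 2 widened the LAPIC proximity domain from 8 to 32 bits. */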
	pxm = pa->proximity_domain_lo;
	if (srat_rev >= 2) {
		pxm |= pa->proximity_domain_hi[0] << 8;
		pxm |= pa->proximity_domain_hi[1] << 16;
		pxm |= pa->proximity_domain_hi[2] << 24;
	}
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
{
	u64 start, end;
	unsigned pxm;
	nodeid_t node;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	start = ma->base_address;
	end = start + ma->length;
	/* Supplement the heuristics in l1tf_calculations(). */
	l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));

	if (num_node_memblks >= NR_NODE_MEMBLKS) {
		dprintk(XENLOG_WARNING,
			"Too many NUMA entries; try a bigger NR_NODE_MEMBLKS\n");
		bad_srat();
		return;
	}

	pxm = ma->proximity_domain;
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	/* It is fine to add this area to the nodes data; it will be used later. */
	i = conflicting_memblks(start, end);
	if (i < 0)
		/* everything fine */;
	else if (memblk_nodeid[i] == node) {
		bool mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
		                !test_bit(i, memblk_hotplug);

		printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
		       mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
		       node_memblk_range[i].start, node_memblk_range[i].end);
		if (mismatch) {
			bad_srat();
			return;
		}
	} else {
		printk(KERN_ERR
		       "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
		       pxm, start, end, node_to_pxm(memblk_nodeid[i]),
		       node_memblk_range[i].start, node_memblk_range[i].end);
		bad_srat();
		return;
	}
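	/*
	 * Only non-hotplug ranges grow the node's span; hotplug ranges are
	 * merely recorded as memory blocks.
	 */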
	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
		struct node *nd = &nodes[node];

		if (!node_test_and_set(node, memory_nodes_parsed)) {
			nd->start = start;
			nd->end = end;
		} else {
			if (start < nd->start)
				nd->start = start;
			if (nd->end < end)
				nd->end = end;
		}
	}
	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
	       node, pxm, start, end,
	       ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
		__set_bit(num_node_memblks, memblk_hotplug);
		if (end > mem_hotplug)
			mem_hotplug = end;
	}
	num_node_memblks++;
}

/*
 * Sanity check to catch more bad SRATs (they are amazingly common).
 * Make sure the PXMs cover all memory.
 */
static int __init nodes_cover_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		int j, found;
		unsigned long long start, end;

		if (e820.map[i].type != E820_RAM)
			continue;

		start = e820.map[i].addr;
		end = e820.map[i].addr + e820.map[i].size;

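		/*
		 * Repeatedly trim the range by any node covering its head
		 * or tail; whatever survives has no PXM assigned to it.
		 */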
		do {
			found = 0;
			for_each_node_mask(j, memory_nodes_parsed)
				if (start < nodes[j].end
				    && end > nodes[j].start) {
					if (start >= nodes[j].start) {
						start = nodes[j].end;
						found = 1;
					}
					if (end <= nodes[j].end) {
						end = nodes[j].start;
						found = 1;
					}
				}
		} while (found && start < end);

		if (start < end) {
			printk(KERN_ERR
			       "SRAT: No PXM for e820 range: %016Lx - %016Lx\n",
			       start, end);
			return 0;
		}
	}
	return 1;
}

void __init acpi_numa_arch_fixup(void) {}

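/*
 * Address bits seen set in any SRAT memory range.  Bits that are never
 * set in RAM regions mark a hole which PDX compression (set up via
 * pfn_pdx_hole_setup()) can squash out of the PFN space.
 */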
static uint64_t __initdata srat_region_mask;

static int __init srat_parse_region(struct acpi_subtable_header *header,
				    const unsigned long end)
{
	struct acpi_srat_mem_affinity *ma;

	if (!header)
		return -EINVAL;

	ma = container_of(header, struct acpi_srat_mem_affinity, header);

	if (!ma->length ||
	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
		return 0;

	if (numa_off)
		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
		       ma->base_address, ma->base_address + ma->length - 1);

	srat_region_mask |= ma->base_address |
			    pdx_region_mask(ma->base_address, ma->length);

	return 0;
}

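/*
 * Derive the PDX hole mask from the SRAT memory ranges and cross-check
 * it against e820 RAM before handing it to pfn_pdx_hole_setup().
 */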
void __init srat_parse_regions(u64 addr)
{
	u64 mask;
	unsigned int i;

	if (acpi_disabled || acpi_numa < 0 ||
	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
		return;

	srat_region_mask = pdx_init_mask(addr);
	acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
			      srat_parse_region, 0);

	for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)
			continue;

		if (~mask & pdx_region_mask(e820.map[i].addr, e820.map[i].size))
			mask = 0;
	}

	pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(u64 start, u64 end)
{
	int i;
	nodemask_t all_nodes_parsed;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++)
		cutoff_node(i, start, end);

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);

	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed);

	/* Finally register nodes */
	for_each_node_mask(i, all_nodes_parsed) {
		u64 size = nodes[i].end - nodes[i].start;

		if (size == 0)
			printk(KERN_WARNING
			       "SRAT: Node %u has no memory. BIOS bug or mis-configured hardware?\n",
			       i);

		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
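	/*
	 * Reset any CPU whose recorded node has no parsed processor
	 * affinity back to NUMA_NO_NODE.
	 */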
	for (i = 0; i < nr_cpu_ids; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!nodemask_test(cpu_to_node[i], &processor_nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

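/*
 * Reverse of pxm_to_node().  Returns PXM 0 if the node has no recorded
 * mapping.
 */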
static unsigned node_to_pxm(nodeid_t n)
{
	unsigned i;

	if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
		return pxm2node[n].pxm;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (pxm2node[i].node == n)
			return pxm2node[i].pxm;
	return 0;
}

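/*
 * SLIT distance between two nodes.  Without a SLIT, fall back to the
 * ACPI defaults of 10 (local) and 20 (remote).
 */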
u8 __node_distance(nodeid_t a, nodeid_t b)
{
	unsigned index;
	u8 slit_val;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->locality_count * node_to_pxm(a);
	slit_val = acpi_slit->entry[index + node_to_pxm(b)];

	/* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
	if ((slit_val == 0xff) || (slit_val <= 9))
		return NUMA_NO_DISTANCE;
	else
		return slit_val;
}

EXPORT_SYMBOL(__node_distance);