/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 *
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/init.h>
#include <xen/mm.h>
#include <xen/inttypes.h>
#include <xen/nodemask.h>
#include <xen/acpi.h>
#include <xen/numa.h>
#include <xen/pfn.h>
#include <asm/e820.h>
#include <asm/page.h>
#include <asm/spec_ctrl.h>

static struct acpi_table_slit *__read_mostly acpi_slit;

static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;

struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };

static unsigned node_to_pxm(nodeid_t n);

static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);

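/* Check whether slot @idx of pxm2node holds a valid mapping for @pxm. */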
static inline bool node_found(unsigned idx, unsigned pxm)
{
	return ((pxm2node[idx].pxm == pxm) &&
		(pxm2node[idx].node != NUMA_NO_NODE));
}

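/* Translate proximity domain @pxm to a node id, or NUMA_NO_NODE if unknown. */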
nodeid_t pxm_to_node(unsigned pxm)
{
	unsigned i;

	if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
		return pxm2node[pxm].node;

	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (node_found(i, pxm))
			return pxm2node[i].node;

	return NUMA_NO_NODE;
}

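/* Return the node id already assigned to @pxm, or allocate a fresh one. */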
nodeid_t setup_node(unsigned pxm)
{
	nodeid_t node;
	unsigned idx;
	static bool warned;
	static unsigned nodes_found;

	BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);

	if (pxm < ARRAY_SIZE(pxm2node)) {
		if (node_found(pxm, pxm))
			return pxm2node[pxm].node;

		/* Try to maintain indexing of pxm2node by pxm */
		if (pxm2node[pxm].node == NUMA_NO_NODE) {
			idx = pxm;
			goto finish;
		}
	}

	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
		if (pxm2node[idx].node == NUMA_NO_NODE)
			goto finish;

	if (!warned) {
		printk(KERN_WARNING "SRAT: Too many proximity domains (%#x)\n",
		       pxm);
		warned = true;
	}

	return NUMA_NO_NODE;

 finish:
	node = nodes_found++;
	if (node >= MAX_NUMNODES)
		return NUMA_NO_NODE;
	pxm2node[idx].pxm = pxm;
	pxm2node[idx].node = node;

	return node;
}

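/* Check whether the range [start, end) lies within a single memory block of @node. */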
int valid_numa_range(u64 start, u64 end, nodeid_t node)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];

		if (nd->start <= start && nd->end >= end &&
		    memblk_nodeid[i] == node)
			return 1;
	}

	return 0;
}

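/* Return the index of a memory block overlapping [start, end), or -1 if none. */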
static __init int conflicting_memblks(u64 start, u64 end)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

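/* Clamp node @i's range to the window [start, end); an empty intersection collapses to a zero-size range. */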
static __init void cutoff_node(int i, u64 start, u64 end)
{
	struct node *nd = &nodes[i];
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

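/* Invalidate all SRAT-derived state once the table has proven inconsistent. */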
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		pxm2node[i].node = NUMA_NO_NODE;
	mem_hotplug = 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	mfn_t mfn;

	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. "
		       "Not used.\n");
		return;
	}
	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
	acpi_slit = mfn_to_virt(mfn_x(mfn));
	memcpy(acpi_slit, slit, slit->header.length);
}

/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	if (pa->apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
		return;
	}

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}

	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	pxm = pa->proximity_domain_lo;
	if (srat_rev >= 2) {
		pxm |= pa->proximity_domain_hi[0] << 8;
		pxm |= pa->proximity_domain_hi[1] << 16;
		pxm |= pa->proximity_domain_hi[2] << 24;
	}
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
{
	u64 start, end;
	unsigned pxm;
	nodeid_t node;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	start = ma->base_address;
	end = start + ma->length;
	/* Supplement the heuristics in l1tf_calculations(). */
	l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));

	if (num_node_memblks >= NR_NODE_MEMBLKS) {
		dprintk(XENLOG_WARNING,
			"Too many NUMA entries; increase NR_NODE_MEMBLKS\n");
		bad_srat();
		return;
	}

	pxm = ma->proximity_domain;
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	/* It is fine to add this area to the nodes data; it will be used later. */
	i = conflicting_memblks(start, end);
	if (i < 0)
		/* everything fine */;
	else if (memblk_nodeid[i] == node) {
		bool mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
		                !test_bit(i, memblk_hotplug);

		printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
		       mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
		       node_memblk_range[i].start, node_memblk_range[i].end);
		if (mismatch) {
			bad_srat();
			return;
		}
	} else {
		printk(KERN_ERR
		       "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
		       pxm, start, end, node_to_pxm(memblk_nodeid[i]),
		       node_memblk_range[i].start, node_memblk_range[i].end);
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
		struct node *nd = &nodes[node];

		if (!node_test_and_set(node, memory_nodes_parsed)) {
			nd->start = start;
			nd->end = end;
		} else {
			if (start < nd->start)
				nd->start = start;
			if (nd->end < end)
				nd->end = end;
		}
	}
	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
	       node, pxm, start, end,
	       ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
		__set_bit(num_node_memblks, memblk_hotplug);
		if (end > mem_hotplug)
			mem_hotplug = end;
	}
	num_node_memblks++;
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		int j, found;
		unsigned long long start, end;

		if (e820.map[i].type != E820_RAM)
			continue;

		start = e820.map[i].addr;
		end = e820.map[i].addr + e820.map[i].size;

		do {
			found = 0;
			for_each_node_mask(j, memory_nodes_parsed)
				if (start < nodes[j].end
				    && end > nodes[j].start) {
					if (start >= nodes[j].start) {
						start = nodes[j].end;
						found = 1;
					}
					if (end <= nodes[j].end) {
						end = nodes[j].start;
						found = 1;
					}
				}
		} while (found && start < end);

		if (start < end) {
			printk(KERN_ERR "SRAT: No PXM for e820 range: "
			       "%016Lx - %016Lx\n", start, end);
			return 0;
		}
	}
	return 1;
}

void __init acpi_numa_arch_fixup(void) {}

static uint64_t __initdata srat_region_mask;

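/* Table-walk callback: fold each usable SRAT memory range into srat_region_mask. */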
static int __init srat_parse_region(struct acpi_subtable_header *header,
				    const unsigned long end)
{
	struct acpi_srat_mem_affinity *ma;

	if (!header)
		return -EINVAL;

	ma = container_of(header, struct acpi_srat_mem_affinity, header);

	if (!ma->length ||
	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
		return 0;

	if (numa_off)
		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
		       ma->base_address, ma->base_address + ma->length - 1);

	srat_region_mask |= ma->base_address |
			    pdx_region_mask(ma->base_address, ma->length);

	return 0;
}

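/* Derive the PFN <-> PDX compression mask from the SRAT memory ranges, cross-checked against E820 RAM. */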
void __init srat_parse_regions(u64 addr)
{
	u64 mask;
	unsigned int i;

	if (acpi_disabled || acpi_numa < 0 ||
	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
		return;

	srat_region_mask = pdx_init_mask(addr);
	acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
			      srat_parse_region, 0);

	for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)
			continue;

		if (~mask & pdx_region_mask(e820.map[i].addr, e820.map[i].size))
			mask = 0;
	}

	pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(u64 start, u64 end)
{
	int i;
	nodemask_t all_nodes_parsed;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++)
		cutoff_node(i, start, end);

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);

	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed);

	/* Finally register nodes */
	for_each_node_mask(i, all_nodes_parsed) {
		u64 size = nodes[i].end - nodes[i].start;

		if (size == 0)
			printk(KERN_WARNING "SRAT: Node %u has no memory. "
			       "BIOS Bug or mis-configured hardware?\n", i);

		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	for (i = 0; i < nr_cpu_ids; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!nodemask_test(cpu_to_node[i], &processor_nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

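/* Reverse mapping: translate node id @n back to its proximity domain. */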
static unsigned node_to_pxm(nodeid_t n)
{
	unsigned i;

	if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
		return pxm2node[n].pxm;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (pxm2node[i].node == n)
			return pxm2node[i].pxm;
	return 0;
}

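/* Return the SLIT distance between nodes @a and @b (defaults: 10 local, 20 remote). */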
u8 __node_distance(nodeid_t a, nodeid_t b)
{
	unsigned index;
	u8 slit_val;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->locality_count * node_to_pxm(a);
	slit_val = acpi_slit->entry[index + node_to_pxm(b)];

	/* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
	if ((slit_val == 0xff) || (slit_val <= 9))
		return NUMA_NO_DISTANCE;
	else
		return slit_val;
}

EXPORT_SYMBOL(__node_distance);