/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/param.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
#include <xen/softirq.h>

static int numa_setup(const char *s);
custom_param("numa", numa_setup);

#ifndef Dprintk
#define Dprintk(x...)
#endif

/* from proto.h */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))

struct node_data node_data[MAX_NUMNODES];

/* Mapping from pdx to node id */
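/* Entries are indexed by pdx >> memnode_shift; see populate_memnodemap(). */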
int memnode_shift;
static typeof(*memnodemap) _memnodemap[64];
unsigned long memnodemapsize;
u8 *memnodemap;

nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
    [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/*
 * Keep the BIOS's CPU-to-node information; it should not be used for memory
 * allocation.
 */
nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };

bool numa_off;
s8 acpi_numa = 0;

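/* True when NUMA is turned off or the ACPI SRAT data cannot be used. */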
int srat_disabled(void)
{
    return numa_off || acpi_numa < 0;
}

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init populate_memnodemap(const struct node *nodes,
                                      int numnodes, int shift, nodeid_t *nodeids)
{
    unsigned long spdx, epdx;
    int i, res = -1;

    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        if ( (epdx >> shift) >= memnodemapsize )
            return 0;
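        /*
         * Walk this node's pdx range in (1UL << shift) steps; finding a
         * slot already owned by another node means the nodes overlap.
         */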
        do {
            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
                return -1;

            if ( !nodeids )
                memnodemap[spdx >> shift] = i;
            else
                memnodemap[spdx >> shift] = nodeids[i];

            spdx += (1UL << shift);
        } while ( spdx < epdx );
        res = 1;
    }

    return res;
}

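/*
 * The static _memnodemap[] may be too small for the required map size; in
 * that case compute_hash_shift() uses this helper to allocate the map from
 * the boot allocator instead.
 */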
static int __init allocate_cachealigned_memnodemap(void)
{
    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));

    memnodemap = mfn_to_virt(mfn);
    mfn <<= PAGE_SHIFT;
    size <<= PAGE_SHIFT;
    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
           mfn, mfn + size);
    memnodemapsize = size / sizeof(*memnodemap);

    return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
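/*
 * Example (assuming pdx == pfn, i.e. no pdx compression): nodes starting at
 * 0, 4GB and 8GB have start pdx values 0, 1 << 20 and 1 << 21 with 4kB
 * pages, so the lowest set bit, and hence the shift, is 20.
 */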
static int __init extract_lsb_from_nodes(const struct node *nodes,
                                         int numnodes)
{
    int i, nodes_used = 0;
    unsigned long spdx, epdx;
    unsigned long bitfield = 0, memtop = 0;

    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        bitfield |= spdx;
        nodes_used++;
        if ( epdx > memtop )
            memtop = epdx;
    }
    if ( nodes_used <= 1 )
        i = BITS_PER_LONG - 1;
    else
        i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
    memnodemapsize = (memtop >> i) + 1;
    return i;
}

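/*
 * Compute the shift used to index memnodemap[] and fill the map in,
 * allocating a larger map from the boot allocator if the static one is too
 * small. Returns the shift on success, -1 on failure.
 */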
int __init compute_hash_shift(struct node *nodes, int numnodes,
                              nodeid_t *nodeids)
{
    int shift;

    shift = extract_lsb_from_nodes(nodes, numnodes);
    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
        memnodemap = _memnodemap;
    else if ( allocate_cachealigned_memnodemap() )
        return -1;
    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
    {
        printk(KERN_INFO "Your memory is not aligned; you need to "
               "rebuild your hypervisor with a bigger NODEMAPSIZE, "
               "shift=%d\n", shift);
        return -1;
    }

    return shift;
}
/* Initialize NODE_DATA given nodeid and start/end */
void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

    node_set_online(nodeid);
}

void __init numa_init_array(void)
{
    int rr, i;

    /*
     * There are unfortunately some poorly designed mainboards around that
     * only connect memory to a single CPU. This breaks the 1:1 cpu->node
     * mapping. To avoid this, fill in the mapping for all possible CPUs,
     * as the number of CPUs is not known yet. We round robin the existing
     * nodes.
     */
    rr = first_node(node_online_map);
    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        if ( cpu_to_node[i] != NUMA_NO_NODE )
            continue;
        numa_set_node(i, rr);
        rr = cycle_node(rr, node_online_map);
    }
}

#ifdef CONFIG_NUMA_EMU
static int numa_fake __initdata = 0;

/* NUMA emulation */
static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    u64 sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function */
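    /*
     * Rounding the per-node size down to a power of two keeps each fake
     * node's start aligned, which the shift-based memnodemap lookup
     * relies on.
     */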
    if ( hweight64(sz) > 1 )
    {
        u64 x = 1;
        while ( (x << 1) < sz )
            x <<= 1;
        if ( x < sz / 2 )
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }

    memset(&nodes, 0, sizeof(nodes));
    for ( i = 0; i < numa_fake; i++ )
    {
        nodes[i].start = (start_pfn << PAGE_SHIFT) + i * sz;
        if ( i == numa_fake - 1 )
            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
    if ( memnode_shift < 0 )
    {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node ( i )
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();

    return 0;
}
#endif

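/*
 * Boot-time NUMA setup: try NUMA emulation and ACPI SRAT parsing in turn,
 * and fall back to a single dummy node spanning all of memory.
 */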
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;

#ifdef CONFIG_NUMA_EMU
    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
        return;
#endif

#ifdef CONFIG_ACPI_NUMA
    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
                                       (u64)end_pfn << PAGE_SHIFT) )
        return;
#endif

    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");

    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
           (u64)start_pfn << PAGE_SHIFT,
           (u64)end_pfn << PAGE_SHIFT);
    /* Set up a dummy node covering all memory */
    memnode_shift = BITS_PER_LONG - 1;
    memnodemap = _memnodemap;
    nodes_clear(node_online_map);
    node_set_online(0);
    for ( i = 0; i < nr_cpu_ids; i++ )
        numa_set_node(i, 0);
    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
                       (u64)end_pfn << PAGE_SHIFT);
}

void numa_add_cpu(int cpu)
{
    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void numa_set_node(int cpu, nodeid_t node)
{
    cpu_to_node[cpu] = node;
}

/* Handle the "numa=" command line option: off | on | fake=<n> | noacpi. */
static __init int numa_setup(const char *opt)
{
    if ( !strncmp(opt, "off", 3) )
        numa_off = true;
    else if ( !strncmp(opt, "on", 2) )
        numa_off = false;
#ifdef CONFIG_NUMA_EMU
    else if ( !strncmp(opt, "fake=", 5) )
    {
        numa_off = false;
        numa_fake = simple_strtoul(opt + 5, NULL, 0);
        if ( numa_fake >= MAX_NUMNODES )
            numa_fake = MAX_NUMNODES;
    }
#endif
#ifdef CONFIG_ACPI_NUMA
    else if ( !strncmp(opt, "noacpi", 6) )
    {
        numa_off = false;
        acpi_numa = -1;
    }
#endif
    else
        return -EINVAL;

    return 0;
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU. This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the fake-node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is fine because
 * cpu_to_node[] has already been initialised in a round-robin manner by
 * numa_init_array() prior to this call, and that initialisation is good
 * enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
    unsigned int i;
    nodeid_t node;

    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        u32 apicid = x86_cpu_to_apicid[i];

        if ( apicid == BAD_APICID )
            continue;
        node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
        if ( node == NUMA_NO_NODE || !node_online(node) )
            node = 0;
        numa_set_node(i, node);
    }
}

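/*
 * Choose the DMA address width (in bits) based on the first online node
 * with memory below 4GB, capping the result at 32 bits.
 */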
unsigned int __init arch_get_dma_bitsize(void)
{
    unsigned int node;

    for_each_online_node ( node )
        if ( node_spanned_pages(node) &&
             !(node_start_pfn(node) >> (32 - PAGE_SHIFT)) )
            break;
    if ( node >= MAX_NUMNODES )
        panic("No node with memory below 4Gb\n");

    /*
     * Try not to reserve the whole node's memory for DMA; instead, divide
     * its spanned pages by the (arbitrarily chosen) 4.
     */
    return min_t(unsigned int,
                 flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                 + PAGE_SHIFT, 32);
}

static void dump_numa(unsigned char key)
{
    s_time_t now = NOW();
    unsigned int i, j, n;
    struct domain *d;
    struct page_info *page;
    unsigned int page_num_node[MAX_NUMNODES];
    const struct vnuma_info *vnuma;

    printk("'%c' pressed -> dumping numa info (now = %"PRI_stime")\n", key,
           now);

    for_each_online_node ( i )
    {
        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);

        printk("NODE%u start->%lu size->%lu free->%lu\n",
               i, node_start_pfn(i), node_spanned_pages(i),
               avail_node_heap_pages(i));
        /* Sanity check phys_to_nid() */
        if ( phys_to_nid(pa) != i )
            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
                   pa, phys_to_nid(pa), i);
    }

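    /*
     * Print the CPU-to-node mapping, coalescing runs of consecutive CPUs
     * that belong to the same node into a single "CPUx...y" line.
     */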
    j = cpumask_first(&cpu_online_map);
    n = 0;
    for_each_online_cpu ( i )
    {
        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
        {
            if ( n > 1 )
                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
            else
                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
            j = i;
            n = 1;
        }
        else
            ++n;
    }
    if ( n > 1 )
        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
    else
        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);

    rcu_read_lock(&domlist_read_lock);

    printk("Memory location of each domain:\n");
    for_each_domain ( d )
    {
        process_pending_softirqs();

        printk("Domain %u (total: %u):\n", d->domain_id, domain_tot_pages(d));

        for_each_online_node ( i )
            page_num_node[i] = 0;

        spin_lock(&d->page_alloc_lock);
        page_list_for_each ( page, &d->page_list )
        {
            i = phys_to_nid(page_to_maddr(page));
            page_num_node[i]++;
        }
        spin_unlock(&d->page_alloc_lock);

        for_each_online_node ( i )
            printk("    Node %u: %u\n", i, page_num_node[i]);

        if ( !read_trylock(&d->vnuma_rwlock) )
            continue;

        if ( !d->vnuma )
        {
            read_unlock(&d->vnuma_rwlock);
            continue;
        }

        vnuma = d->vnuma;
        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
               vnuma->nr_vnodes, d->max_vcpus);
        for ( i = 0; i < vnuma->nr_vnodes; i++ )
        {
            unsigned int start_cpu = ~0U;

            if ( vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
                printk("       %3u: pnode ???,", i);
            else
                printk("       %3u: pnode %3u,", i, vnuma->vnode_to_pnode[i]);

            printk(" vcpus ");

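            /*
             * Print this vnode's vCPUs as compressed ranges: remember where
             * a run of consecutive vCPUs starts and emit "start-end" once
             * the run ends.
             */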
            for ( j = 0; j < d->max_vcpus; j++ )
            {
                if ( !(j & 0x3f) )
                    process_pending_softirqs();

                if ( vnuma->vcpu_to_vnode[j] == i )
                {
                    if ( start_cpu == ~0U )
                    {
                        printk("%d", j);
                        start_cpu = j;
                    }
                }
                else if ( start_cpu != ~0U )
                {
                    if ( j - 1 != start_cpu )
                        printk("-%d ", j - 1);
                    else
                        printk(" ");
                    start_cpu = ~0U;
                }
            }

            if ( start_cpu != ~0U && start_cpu != j - 1 )
                printk("-%d", j - 1);

            printk("\n");

            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
            {
                if ( vnuma->vmemrange[j].nid == i )
                    printk("           %016"PRIx64" - %016"PRIx64"\n",
                           vnuma->vmemrange[j].start,
                           vnuma->vmemrange[j].end);
            }
        }

        read_unlock(&d->vnuma_rwlock);
    }

    rcu_read_unlock(&domlist_read_lock);
}

static __init int register_numa_trigger(void)
{
    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
    return 0;
}
__initcall(register_numa_trigger);