/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/param.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
#include <xen/softirq.h>

static int numa_setup(const char *s);
custom_param("numa", numa_setup);

#ifndef Dprintk
#define Dprintk(x...)
#endif

/* from proto.h */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
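/*
 * Note: round_up() assumes y is a power of two, e.g.
 * round_up(0x1234, 0x1000) == 0x2000.
 */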

struct node_data node_data[MAX_NUMNODES];

/* Mapping from pdx to node id */
int memnode_shift;
static typeof(*memnodemap) _memnodemap[64];
unsigned long memnodemapsize;
u8 *memnodemap;

nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
    [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/*
 * Keep BIOS's CPU-to-node information; it should not be used for memory
 * allocation.
 */
nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };

bool numa_off;
s8 acpi_numa = 0;

int srat_disabled(void)
{
    return numa_off || acpi_numa < 0;
}

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init populate_memnodemap(const struct node *nodes,
                                      int numnodes, int shift, nodeid_t *nodeids)
{
    unsigned long spdx, epdx;
    int i, res = -1;

    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        if ( (epdx >> shift) >= memnodemapsize )
            return 0;
        do {
            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
                return -1;

            if ( !nodeids )
                memnodemap[spdx >> shift] = i;
            else
                memnodemap[spdx >> shift] = nodeids[i];

            spdx += (1UL << shift);
        } while ( spdx < epdx );
        res = 1;
    }

    return res;
}
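
/*
 * Illustrative lookup (a sketch, not part of this file): once the map is
 * populated, resolving a node is one shift plus one byte load, roughly
 *     nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
 * which is what phys_to_nid(), sanity-checked in dump_numa() below,
 * boils down to.
 */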

static int __init allocate_cachealigned_memnodemap(void)
{
    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));

    memnodemap = mfn_to_virt(mfn);
    mfn <<= PAGE_SHIFT;
    size <<= PAGE_SHIFT;
    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
           mfn, mfn + size);
    memnodemapsize = size / sizeof(*memnodemap);

    return 0;
}

/*
 * The LSB of all start addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct node *nodes,
                                         int numnodes)
{
    int i, nodes_used = 0;
    unsigned long spdx, epdx;
    unsigned long bitfield = 0, memtop = 0;

    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        bitfield |= spdx;
        nodes_used++;
        if ( epdx > memtop )
            memtop = epdx;
    }
    if ( nodes_used <= 1 )
        i = BITS_PER_LONG - 1;
    else
        i = find_first_bit(&bitfield, BITS_PER_LONG);
    memnodemapsize = (memtop >> i) + 1;
    return i;
}
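
/*
 * Worked example (illustrative, assuming 4KiB pages): two 1GiB nodes
 * starting at 0 and 1GiB have start pdx values 0 and 0x40000, so
 * bitfield = 0x40000 and the shift is 18; memtop = 0x80000 then gives
 * memnodemapsize = (0x80000 >> 18) + 1 = 3.
 */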

int __init compute_hash_shift(struct node *nodes, int numnodes,
                              nodeid_t *nodeids)
{
    int shift;

    shift = extract_lsb_from_nodes(nodes, numnodes);
    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
        memnodemap = _memnodemap;
    else if ( allocate_cachealigned_memnodemap() )
        return -1;
    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
    {
        printk(KERN_INFO "Your memory is not aligned; you need to "
               "rebuild your hypervisor with a bigger NODEMAPSIZE, "
               "shift=%d\n", shift);
        return -1;
    }

    return shift;
}

/* initialize NODE_DATA given nodeid and start/end */
void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

    node_set_online(nodeid);
}

void __init numa_init_array(void)
{
    int rr, i;

    /*
     * There are unfortunately some poorly designed mainboards around that
     * only connect memory to a single CPU. This breaks the 1:1 cpu->node
     * mapping. To avoid this, fill in the mapping for all possible CPUs,
     * as the number of CPUs is not known yet. We round robin the existing
     * nodes.
     */
    rr = first_node(node_online_map);
    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        if ( cpu_to_node[i] != NUMA_NO_NODE )
            continue;
        numa_set_node(i, rr);
        rr = cycle_node(rr, node_online_map);
    }
}
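
/*
 * Example (illustrative): with nodes {0, 1} online and no firmware
 * affinity, CPUs 0,1,2,3 end up on nodes 0,1,0,1 respectively.
 */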

#ifdef CONFIG_NUMA_EMU
static int numa_fake __initdata = 0;

/* Numa emulation */
static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    u64 sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function */
    if ( hweight64(sz) > 1 )
    {
        u64 x = 1;
        while ( (x << 1) < sz )
            x <<= 1;
        if ( x < sz / 2 )
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }

    memset(&nodes, 0, sizeof(nodes));
    for ( i = 0; i < numa_fake; i++ )
    {
        nodes[i].start = (start_pfn << PAGE_SHIFT) + i * sz;
        if ( i == numa_fake - 1 )
            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
    if ( memnode_shift < 0 )
    {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node ( i )
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();

    return 0;
}
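
/*
 * A sketch of the effect (illustrative): booting with "numa=fake=4" on a
 * 4GiB machine fakes four 1GiB nodes. The per-node size is first rounded
 * down to a power of two for the hash, with the last node absorbing any
 * remainder.
 */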
#endif

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;

#ifdef CONFIG_NUMA_EMU
    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
        return;
#endif

#ifdef CONFIG_ACPI_NUMA
    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
         (u64)end_pfn << PAGE_SHIFT) )
        return;
#endif

    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");

    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
           (u64)start_pfn << PAGE_SHIFT,
           (u64)end_pfn << PAGE_SHIFT);
    /*
     * Set up a dummy node covering all memory. With the shift at
     * BITS_PER_LONG - 1 every pdx hashes to entry 0 of the (statically
     * zero-initialised) _memnodemap, i.e. to node 0.
     */
    memnode_shift = BITS_PER_LONG - 1;
    memnodemap = _memnodemap;
    nodes_clear(node_online_map);
    node_set_online(0);
    for ( i = 0; i < nr_cpu_ids; i++ )
        numa_set_node(i, 0);
    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
                       (u64)end_pfn << PAGE_SHIFT);
}

void numa_add_cpu(int cpu)
{
    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void numa_set_node(int cpu, nodeid_t node)
{
    cpu_to_node[cpu] = node;
}

/* [numa=off|on|fake=<n>|noacpi] */
static __init int numa_setup(const char *opt)
{
    if ( !strncmp(opt, "off", 3) )
        numa_off = true;
    else if ( !strncmp(opt, "on", 2) )
        numa_off = false;
#ifdef CONFIG_NUMA_EMU
    else if ( !strncmp(opt, "fake=", 5) )
    {
        numa_off = false;
        numa_fake = simple_strtoul(opt + 5, NULL, 0);
        if ( numa_fake >= MAX_NUMNODES )
            numa_fake = MAX_NUMNODES;
    }
#endif
#ifdef CONFIG_ACPI_NUMA
    else if ( !strncmp(opt, "noacpi", 6) )
    {
        numa_off = false;
        acpi_numa = -1;
    }
#endif
    else
        return -EINVAL;

    return 0;
}
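
/*
 * Usage examples (illustrative): "numa=off" disables NUMA entirely,
 * "numa=fake=2" splits memory into two emulated nodes, and "numa=noacpi"
 * keeps NUMA on but ignores the ACPI SRAT.
 */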

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU. This means we
 * skip cpu_to_node[] initialisation for NUMA emulation and the faked
 * node case (when running a kernel compiled for NUMA on a non-NUMA box),
 * which is fine as cpu_to_node[] is already initialised in a round-robin
 * manner at numa_init_array(), prior to this call, and that
 * initialisation is good enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
    unsigned int i;
    nodeid_t node;

    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        u32 apicid = x86_cpu_to_apicid[i];

        if ( apicid == BAD_APICID )
            continue;
        node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
        if ( node == NUMA_NO_NODE || !node_online(node) )
            node = 0;
        numa_set_node(i, node);
    }
}

unsigned int __init arch_get_dma_bitsize(void)
{
    unsigned int node;

    for_each_online_node(node)
        if ( node_spanned_pages(node) &&
             !(node_start_pfn(node) >> (32 - PAGE_SHIFT)) )
            break;
    if ( node >= MAX_NUMNODES )
        panic("No node with memory below 4GB\n");

    /*
     * Try not to reserve the whole node's memory for DMA, by (arbitrarily)
     * dividing its spanned pages by 4.
     */
    return min_t(unsigned int,
                 flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                 + PAGE_SHIFT, 32);
}
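
/*
 * Worked example (illustrative, assuming 4KiB pages): a node starting at
 * pfn 0 and spanning 0x100000 pages (4GiB) gives
 * flsl(0 + 0x40000 - 1) + 12 = 18 + 12 = 30, i.e. a 1GiB DMA zone; the
 * result is capped at 32 bits in any case.
 */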

static void dump_numa(unsigned char key)
{
    s_time_t now = NOW();
    unsigned int i, j, n;
    struct domain *d;
    struct page_info *page;
    unsigned int page_num_node[MAX_NUMNODES];
    const struct vnuma_info *vnuma;

    printk("'%c' pressed -> dumping numa info (now = %"PRI_stime")\n", key,
           now);

    for_each_online_node ( i )
    {
        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);

        printk("NODE%u start->%lu size->%lu free->%lu\n",
               i, node_start_pfn(i), node_spanned_pages(i),
               avail_node_heap_pages(i));
        /* sanity check phys_to_nid() */
        if ( phys_to_nid(pa) != i )
            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
                   pa, phys_to_nid(pa), i);
    }

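    /*
     * Coalesce runs of consecutive CPUs on the same node into one line,
     * e.g. "CPU0...3 -> NODE0" (illustrative output format).
     */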
    j = cpumask_first(&cpu_online_map);
    n = 0;
    for_each_online_cpu ( i )
    {
        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
        {
            if ( n > 1 )
                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
            else
                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
            j = i;
            n = 1;
        }
        else
            ++n;
    }
    if ( n > 1 )
        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
    else
        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);

    rcu_read_lock(&domlist_read_lock);

    printk("Memory location of each domain:\n");
    for_each_domain ( d )
    {
        process_pending_softirqs();

        printk("Domain %u (total: %u):\n", d->domain_id, domain_tot_pages(d));

        for_each_online_node ( i )
            page_num_node[i] = 0;

        spin_lock(&d->page_alloc_lock);
        page_list_for_each(page, &d->page_list)
        {
            i = phys_to_nid(page_to_maddr(page));
            page_num_node[i]++;
        }
        spin_unlock(&d->page_alloc_lock);

        for_each_online_node ( i )
            printk("    Node %u: %u\n", i, page_num_node[i]);

        if ( !read_trylock(&d->vnuma_rwlock) )
            continue;

        if ( !d->vnuma )
        {
            read_unlock(&d->vnuma_rwlock);
            continue;
        }

        vnuma = d->vnuma;
        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
               vnuma->nr_vnodes, d->max_vcpus);
        for ( i = 0; i < vnuma->nr_vnodes; i++ )
        {
            unsigned int start_cpu = ~0U;

            if ( vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
                printk("       %3u: pnode ???,", i);
            else
                printk("       %3u: pnode %3u,", i, vnuma->vnode_to_pnode[i]);

            printk(" vcpus ");

            for ( j = 0; j < d->max_vcpus; j++ )
            {
                if ( !(j & 0x3f) )
                    process_pending_softirqs();

                if ( vnuma->vcpu_to_vnode[j] == i )
                {
                    if ( start_cpu == ~0U )
                    {
                        printk("%d", j);
                        start_cpu = j;
                    }
                }
                else if ( start_cpu != ~0U )
                {
                    if ( j - 1 != start_cpu )
                        printk("-%d ", j - 1);
                    else
                        printk(" ");
                    start_cpu = ~0U;
                }
            }

            if ( start_cpu != ~0U && start_cpu != j - 1 )
                printk("-%d", j - 1);

            printk("\n");

            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
            {
                if ( vnuma->vmemrange[j].nid == i )
                    printk("           %016"PRIx64" - %016"PRIx64"\n",
                           vnuma->vmemrange[j].start,
                           vnuma->vmemrange[j].end);
            }
        }

        read_unlock(&d->vnuma_rwlock);
    }

    rcu_read_unlock(&domlist_read_lock);
}

static __init int register_numa_trigger(void)
{
    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
    return 0;
}
__initcall(register_numa_trigger);