1 /*
2  * Copyright (C) 2012      Citrix Ltd.
3  * Author Dario Faggioli <dario.faggioli@citrix.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published
7  * by the Free Software Foundation; version 2.1 only. with the special
8  * exception on linking described in file LICENSE.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  */
15 
16 #include "libxl_osdeps.h" /* must come before any other headers */
17 
18 #include <glob.h>
19 
20 #include "libxl_internal.h"
21 
22 /*
23  * What follows are helpers for generating all the k-combinations
24  * without repetitions of a set S with n elements in it. Formally
25  * speaking, they are subsets of k distinct elements of S and, if
26  * S is n elements big, the number of k-combinations is equal to the
27  * binomial coefficient C(n k)=n!/(k! * (n - k)!).
28  *
 * The various subsets are generated one after the other by calling
30  * comb_init() first, and, after that, comb_next()
31  * C(n k)-1 times. An iterator is used to store the current status
32  * of the whole generation operation (i.e., basically, the last
33  * combination that has been generated). As soon as all combinations
34  * have been generated, comb_next() will start returning 0 instead of
35  * 1. The same instance of the iterator and the same values for
36  * n and k _must_ be used for each call (if that doesn't happen, the
37  * result is unspecified).
38  *
 * The algorithm is a well known one (see, for example, D. Knuth's "The
 * Art of Computer Programming - Volume 4, Fascicle 3") and it produces
 * the combinations in such a way that they (well, more precisely, their
 * indexes in the array/map representing the set) come with lexicographic
 * ordering.
44  *
45  * For example, with n = 5 and k = 3, calling comb_init()
46  * will generate { 0, 1, 2 }, while subsequent valid calls to
47  * comb_next() will produce the following:
48  * { { 0, 1, 3 }, { 0, 1, 4 },
49  *   { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 },
50  *   { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 },
51  *   { 2, 3, 4 } }
52  *
53  * This is used by the automatic NUMA placement logic below.
54  */
55 typedef int* comb_iter_t;
56 
/*
 * Set up a k-combination iterator, positioned on the very first
 * (lexicographically smallest) combination. Returns 1 on success,
 * 0 when no k-combination of an n element set exists (n < k).
 * The iterator storage is gc-allocated via GCNEW_ARRAY.
 */
static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k)
{
    comb_iter_t iter;
    int idx;

    if (k > n)
        return 0;

    /* The first combination is always { 0, 1, 2, ..., k-1 } */
    GCNEW_ARRAY(iter, k);
    for (idx = 0; idx < k; idx++)
        iter[idx] = idx;

    *it = iter;
    return 1;
}
73 
/* comb_iter_t is just int*; it is spelled out here for clarity. */
static int comb_next(int *it, int n, int k)
{
    int pos;

    /*
     * Locate the rightmost element of the iterator that can still be
     * incremented, i.e., the highest position whose value has not yet
     * reached its maximum (which is n-k+pos). For instance, with
     * { 0, 1, 4 } and n = 5, the third element is already at its
     * maximum (4 == n-1), so the second one is picked.
     *
     * Once found, that element is bumped by one and every position to
     * its right is rewritten with consecutive successors, which is
     * exactly what keeps the generation in lexicographic order.
     *
     * Termination: when even the first position has reached its
     * maximum (n-k), the last combination has already been produced
     * (e.g., { 2, 3, 4 } for n = 5, k = 3), and 0 is returned from
     * that point on.
     */
    for (pos = k - 1; it[pos] == n - k + pos; pos--) {
        if (pos <= 0)
            return 0;
    }

    it[pos]++;
    while (++pos < k)
        it[pos] = it[pos - 1] + 1;

    return 1;
}
104 
105 /* NUMA automatic placement (see libxl_internal.h for details) */
106 
107 /*
108  * This function turns a k-combination iterator into a node map,
109  * given another map, telling us which nodes should be considered.
110  *
 * This means the bits that are set in suitable_nodemap and that
 * correspond to the indexes of the given combination are the ones
 * that will be set in nodemap.
 *
 * For example, given a fully set suitable_nodemap, if the iterator
 * represents the combination { 0, 2, 4 }, nodemap will have bits #0,
 * #2 and #4 set.
 * On the other hand, if, say, suitable_nodemap=01011011, the same
 * iterator will cause bits #1, #4 and #7 of nodemap to be set.
120  */
static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *suitable_nodemap,
                             libxl_bitmap *nodemap, int k)
{
    int bit, comb_pos = 0, set_bit_ord = 0;

    libxl_bitmap_set_none(nodemap);
    libxl_for_each_set_bit(bit, *suitable_nodemap) {
        /* Check whether the ordinal of this set bit of
         * suitable_nodemap matches the next element of the
         * combination; if so, select the bit and advance to
         * the following element. */
        if (comb_pos < k && it[comb_pos] == set_bit_ord) {
            libxl_bitmap_set(nodemap, bit);
            comb_pos++;
        }
        set_bit_ord++;
    }
}
138 
139 /* Retrieve the number of cpus that the nodes that are part of the nodemap
140  * span and are also set in suitable_cpumap. */
/* Retrieve the number of cpus that the nodes that are part of the nodemap
 * span and are also set in suitable_cpumap. */
static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
                              const libxl_bitmap *suitable_cpumap,
                              const libxl_bitmap *nodemap)
{
    int cpu, count = 0;

    for (cpu = 0; cpu < nr_cpus; cpu++) {
        /* Skip cpus outside the suitable map right away */
        if (!libxl_bitmap_test(suitable_cpumap, cpu))
            continue;
        if (libxl_bitmap_test(nodemap, tinfo[cpu].node))
            count++;
    }
    return count;
}
154 
155 /* Retrieve the amount of free memory within the nodemap */
/*
 * Retrieve the amount of free memory (in KB) within the nodemap.
 *
 * Accumulate in (and return) a uint64_t: ninfo[i].free is divided by
 * 1024 to obtain KB, and with a 32 bit accumulator the sum would wrap
 * as soon as the selected nodes span 4 TiB or more of free memory.
 * The caller compares the result against a uint64_t threshold
 * (min_free_memkb), so widening the return type is safe.
 */
static uint64_t nodemap_to_free_memkb(libxl_numainfo *ninfo,
                                      libxl_bitmap *nodemap)
{
    uint64_t free_memkb = 0;
    int i;

    libxl_for_each_set_bit(i, *nodemap)
        free_memkb += ninfo[i].free / 1024;

    return free_memkb;
}
167 
168 /* Retrieve the number of vcpus able to run on the nodes in nodemap */
/* Retrieve the number of vcpus able to run on the nodes in nodemap,
 * by summing the per-node counts pre-computed in vcpus_on_node[]. */
static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[],
                               const libxl_bitmap *nodemap)
{
    int node, total = 0;

    libxl_for_each_set_bit(node, *nodemap)
        total += vcpus_on_node[node];

    return total;
}
179 
180 /* Number of vcpus able to run on the cpus of the various nodes
181  * (reported by filling the array vcpus_on_node[]). */
/*
 * Number of vcpus able to run on the cpus of the various nodes,
 * reported by filling the array vcpus_on_node[] (one slot per node;
 * the caller must have sized it to the number of host nodes).
 *
 * For every vcpu of every domain on the host, each node whose cpus the
 * vcpu has affinity with (subject to the filters described in the inner
 * loop below) gets its counter incremented once per vcpu.
 *
 * Returns 0 on success, ERROR_FAIL if host/domain information could
 * not be retrieved. Per-domain lookup failures are skipped silently
 * (best effort), not treated as fatal.
 */
static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo,
                             size_t tinfo_elements,
                             const libxl_bitmap *suitable_cpumap,
                             int vcpus_on_node[])
{
    libxl_dominfo *dinfo = NULL;
    libxl_bitmap dom_nodemap, nodes_counted;
    int nr_doms, nr_cpus;
    int i, j, k;

    dinfo = libxl_list_domain(CTX, &nr_doms);
    if (dinfo == NULL)
        return ERROR_FAIL;

    /* Scratch bitmap: which nodes the current vcpu was already
     * counted on (so a vcpu contributes at most 1 per node). */
    if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) {
        libxl_dominfo_list_free(dinfo, nr_doms);
        return ERROR_FAIL;
    }

    /* Scratch bitmap: the node-affinity of the current domain. */
    if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) {
        libxl_bitmap_dispose(&nodes_counted);
        libxl_dominfo_list_free(dinfo, nr_doms);
        return ERROR_FAIL;
    }

    for (i = 0; i < nr_doms; i++) {
        libxl_vcpuinfo *vinfo = NULL;
        int nr_dom_vcpus = 0;
        libxl_cpupoolinfo cpupool_info;
        int cpupool;

        /* Init unconditionally, so the dispose at 'next:' is always safe */
        libxl_cpupoolinfo_init(&cpupool_info);

        cpupool = libxl__domain_cpupool(gc, dinfo[i].domid);
        if (cpupool < 0)
            goto next;
        if (libxl_cpupool_info(CTX, &cpupool_info, cpupool))
            goto next;

        vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus);
        if (vinfo == NULL)
            goto next;

        /* Retrieve the domain's node-affinity map */
        libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap);

        for (j = 0; j < nr_dom_vcpus; j++) {
            /*
             * For each vcpu of each domain, it must have both vcpu-affinity
             * and node-affinity to (a pcpu belonging to) a certain node to
             * cause an increment in the corresponding element of the array.
             *
             * Note that we also need to check whether the cpu actually
             * belongs to the domain's cpupool (the cpupool of the domain
             * being checked). In fact, it could be that the vcpu has affinity
             * with cpus in suitable_cpumask, but that are not in its own
             * cpupool, and we don't want to consider those!
             */
            libxl_bitmap_set_none(&nodes_counted);
            libxl_for_each_set_bit(k, vinfo[j].cpumap) {
                /* Guard against cpumap bits beyond the topology array.
                 * (k is a bit index, hence non-negative, so the
                 * int-vs-size_t comparison below is safe.) */
                if (k >= tinfo_elements)
                    break;
                int node = tinfo[k].node;

                if (libxl_bitmap_test(suitable_cpumap, k) &&
                    libxl_bitmap_test(&cpupool_info.cpumap, k) &&
                    libxl_bitmap_test(&dom_nodemap, node) &&
                    !libxl_bitmap_test(&nodes_counted, node)) {
                    libxl_bitmap_set(&nodes_counted, node);
                    vcpus_on_node[node]++;
                }
            }
        }

 next:
        libxl_cpupoolinfo_dispose(&cpupool_info);
        /* Safe even on the error paths: vinfo NULL / nr_dom_vcpus 0 */
        libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
    }

    libxl_bitmap_dispose(&dom_nodemap);
    libxl_bitmap_dispose(&nodes_counted);
    libxl_dominfo_list_free(dinfo, nr_doms);
    return 0;
}
266 
267 /*
268  * This function tries to figure out if the host has a consistent number
269  * of cpus along all its NUMA nodes. In fact, if that is the case, we can
270  * calculate the minimum number of nodes needed for a domain by just
271  * dividing its total number of vcpus by this value computed here.
272  * However, we are not allowed to assume that all the nodes have the
273  * same number of cpus. Therefore, in case discrepancies among different
274  * nodes are found, this function just returns 0, for the caller to know
275  * it shouldn't rely on this 'optimization', and sort out things in some
276  * other way (by doing something basic, like starting trying with
277  * candidates with just one node).
278  */
static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
                               int nr_nodes)
{
    int node, cpu, common = 0;

    /* Only meaningful if every node hosts the same number of PCPUs */
    for (node = 0; node < nr_nodes; node++) {
        int node_cpus = 0;

        for (cpu = 0; cpu < nr_cpus; cpu++)
            if (tinfo[cpu].node == node)
                node_cpus++;

        /* Remember the first (non-zero) per-node count we see... */
        if (common == 0)
            common = node_cpus;
        /* ...and, if any later node disagrees, turn the whole thing off */
        if (node_cpus != common)
            return 0;
    }
    return common;
}
300 
/*
 * Looks for the placement candidates that satisfy some specific
 * conditions and returns the best one according to the provided
 * comparison function.
 */
/*
 * Find the best NUMA placement candidate matching the caller's
 * constraints (minimum free memory, minimum number of cpus, node count
 * range, suitable cpus), ranking candidates via numa_cmpf. On success,
 * *cndt_found tells whether a candidate was found and, if so, cndt_out
 * holds it (cndt_out is allocated here but disposing it is up to the
 * caller). Returns 0 (also when no candidate exists) or an ERROR_* code.
 */
int libxl__get_numa_candidate(libxl__gc *gc,
                              uint64_t min_free_memkb, int min_cpus,
                              int min_nodes, int max_nodes,
                              const libxl_bitmap *suitable_cpumap,
                              libxl__numa_candidate_cmpf numa_cmpf,
                              libxl__numa_candidate *cndt_out,
                              int *cndt_found)
{
    libxl__numa_candidate new_cndt;
    libxl_cputopology *tinfo = NULL;
    libxl_numainfo *ninfo = NULL;
    int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0;
    libxl_bitmap suitable_nodemap, nodemap;
    int *vcpus_on_node, rc = 0;

    /* Init everything first, so the disposes at 'out:' are always safe */
    libxl_bitmap_init(&nodemap);
    libxl_bitmap_init(&suitable_nodemap);
    libxl__numa_candidate_init(&new_cndt);

    /* Get platform info and prepare the map for testing the combinations */
    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
    if (ninfo == NULL)
        /* NOTE(review): this early return bypasses the 'out:' cleanup;
         * harmless here (the bitmaps/candidate are only initialised,
         * nothing is allocated yet), but 'rc = ERROR_FAIL; goto out;'
         * would be more consistent with the rest of the function. */
        return ERROR_FAIL;

    /* Placement is pointless on a non-NUMA (or single node) host */
    if (nr_nodes <= 1) {
        *cndt_found = 0;
        goto out;
    }

    /* gc-allocated: freed with the gc, no explicit cleanup needed */
    GCNEW_ARRAY(vcpus_on_node, nr_nodes);

    /*
     * The good thing about this solution is that it is based on heuristics
     * (implemented in numa_cmpf() ), but we at least can evaluate it on
     * all the possible placement candidates. That can happen because the
     * number of nodes present in current NUMA systems is quite small.
     * In fact, even if a sum of binomials is involved, if the system has
     * up to 16 nodes it "only" takes 65535 steps. This is fine, as the
     * number of nodes the biggest NUMA systems provide at the time of this
     * writing is 8 (and it will probably continue to be so for a while).
     * However, computational complexity would explode on systems bigger
     * than that, and it's really important we avoid trying to run this
     * on monsters with 32, 64 or more nodes (if they ever pop into being).
     * Therefore, here it comes a safety catch that disables the algorithm
     * for the cases when it wouldn't work well.
     */
    if (nr_nodes > 16) {
        /* Log we did nothing and return 0, as no real error occurred */
        LOG(WARN, "System has %d NUMA nodes, which is too big for the "
                  "placement algorithm to work effectively: skipping it. "
                  "Consider manually pinning the vCPUs and/or looking at "
                  "cpupools for manually partitioning the system.",
                  nr_nodes);
        *cndt_found = 0;
        goto out;
    }

    tinfo = libxl_get_cpu_topology(CTX, &nr_cpus);
    if (tinfo == NULL) {
        rc = ERROR_FAIL;
        goto out;
    }

    rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
    if (rc)
        goto out;
    rc = libxl__numa_candidate_alloc(gc, &new_cndt);
    if (rc)
        goto out;

    /* Allocate and prepare the map of the node that can be utilized for
     * placement, basing on the map of suitable cpus. */
    rc = libxl_node_bitmap_alloc(CTX, &suitable_nodemap, 0);
    if (rc)
        goto out;
    rc = libxl_cpumap_to_nodemap(CTX, suitable_cpumap, &suitable_nodemap);
    if (rc)
        goto out;

    /*
     * Later on, we will try to figure out how many vcpus are runnable on
     * each candidate (as a part of choosing the best one of them). That
     * requires going through all the vcpus of all the domains and check
     * their affinities. So, instead of doing that for each candidate,
     * let's count here the number of vcpus runnable on each node, so that
     * all we have to do later is summing up the right elements of the
     * vcpus_on_node array.
     */
    rc = nr_vcpus_on_nodes(gc, tinfo, nr_cpus, suitable_cpumap, vcpus_on_node);
    if (rc)
        goto out;

    /*
     * If the minimum number of NUMA nodes is not explicitly specified
     * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
     * from where to start generating candidates, if possible (or just start
     * from 1 otherwise). The maximum number of nodes should not exceed the
     * number of existent NUMA nodes on the host, or the candidate generation
     * won't work properly.
     */
    if (!min_nodes) {
        int cpus_per_node;

        cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, nr_nodes);
        if (cpus_per_node == 0)
            min_nodes = 1;
        else
            /* Round up: enough nodes to host min_cpus cpus */
            min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
    }
    /* We also need to be sure we do not exceed the number of
     * nodes we are allowed to use. */
    nr_suit_nodes = libxl_bitmap_count_set(&suitable_nodemap);

    if (min_nodes > nr_suit_nodes)
        min_nodes = nr_suit_nodes;
    if (!max_nodes || max_nodes > nr_suit_nodes)
        max_nodes = nr_suit_nodes;
    if (min_nodes > max_nodes) {
        LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
        rc = ERROR_INVAL;
        goto out;
    }

    /* This is up to the caller to be disposed */
    rc = libxl__numa_candidate_alloc(gc, cndt_out);
    if (rc)
        goto out;

    /*
     * Consider all the combinations with sizes in [min_nodes, max_nodes]
     * (see comb_init() and comb_next()). Note that, since the fewer the
     * number of nodes the better, it is guaranteed that any candidate
     * found during the i-eth step will be better than any other one we
     * could find during the (i+1)-eth and all the subsequent steps (they
     * all will have more nodes). It's thus pointless to keep going if
     * we already found something.
     */
    *cndt_found = 0;
    while (min_nodes <= max_nodes && *cndt_found == 0) {
        comb_iter_t comb_iter;
        int comb_ok;

        /*
         * And here it is. Each step of this cycle generates a combination of
         * nodes as big as min_nodes mandates.  Each of these combinations is
         * checked against the constraints provided by the caller (namely,
         * amount of free memory and number of cpus) and it can concur to
         * become our best placement iff it passes the check.
         */
        for (comb_ok = comb_init(gc, &comb_iter, nr_suit_nodes, min_nodes);
             comb_ok;
             comb_ok = comb_next(comb_iter, nr_suit_nodes, min_nodes)) {
            uint64_t nodes_free_memkb;
            int nodes_cpus;

            /* Get the nodemap for the combination, only considering
             * suitable nodes. */
            comb_get_nodemap(comb_iter, &suitable_nodemap,
                             &nodemap, min_nodes);

            /* If there is not enough memory in this combination, skip it
             * and go generating the next one... */
            nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
            if (min_free_memkb && nodes_free_memkb < min_free_memkb)
                continue;

            /* And the same applies if this combination is short in cpus */
            nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, suitable_cpumap,
                                            &nodemap);
            if (min_cpus && nodes_cpus < min_cpus)
                continue;

            /*
             * Conditions are met, we can compare this candidate with the
             * current best one (if any).
             */
            libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
            new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node,
                                                    &nodemap);
            new_cndt.free_memkb = nodes_free_memkb;
            new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
            new_cndt.nr_cpus = nodes_cpus;

            /*
             * Check whether the new candidate is better than what we found
             * up to now by means of the comparison function. If no
             * comparison function is provided, just return as soon as we
             * find our first candidate.
             */
            if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) {
                *cndt_found = 1;

                LOG(DEBUG, "New best NUMA placement candidate found: "
                           "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
                           "free_memkb=%"PRIu64"", new_cndt.nr_nodes,
                           new_cndt.nr_cpus, new_cndt.nr_vcpus,
                           new_cndt.free_memkb / 1024);

                /* Record the new best candidate in cndt_out */
                libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
                cndt_out->nr_vcpus = new_cndt.nr_vcpus;
                cndt_out->free_memkb = new_cndt.free_memkb;
                cndt_out->nr_nodes = new_cndt.nr_nodes;
                cndt_out->nr_cpus = new_cndt.nr_cpus;

                if (numa_cmpf == NULL)
                    break;
            }
        }
        /* Try again with (one) more node(s) per candidate */
        min_nodes++;
    }

    if (*cndt_found == 0)
        LOG(NOTICE, "NUMA placement failed, performance might be affected");

 out:
    libxl_bitmap_dispose(&nodemap);
    libxl_bitmap_dispose(&suitable_nodemap);
    libxl__numa_candidate_dispose(&new_cndt);
    libxl_numainfo_list_free(ninfo, nr_nodes);
    libxl_cputopology_list_free(tinfo, nr_cpus);
    return rc;
}
528 
529 /*
530  * Local variables:
531  * mode: C
532  * c-basic-offset: 4
533  * indent-tabs-mode: nil
534  * End:
535  */
536