1 /******************************************************************************
2  * page_alloc.c
3  *
4  * Simple buddy heap allocator for Xen.
5  *
6  * Copyright (c) 2002-2004 K A Fraser
7  * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; If not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 /*
24  * In general Xen maintains two pools of memory:
25  *
26  * - Xen heap: Memory which is always mapped (i.e. accessible by
27  *             virtual address), via a permanent and contiguous
28  *             "direct mapping". Macros like va() and pa() are valid
29  *             for such memory and it is always permissible to stash
30  *             pointers to Xen heap memory in data structures etc.
31  *
32  *             Xen heap pages are always anonymous (that is, not tied
33  *             or accounted to any particular domain).
34  *
35  * - Dom heap: Memory which must be explicitly mapped, usually
36  *             transiently with map_domain_page(), in order to be
37  *             used. va() and pa() are not valid for such memory. When
38  *             stashing pointers to dom heap pages, take care that the
39  *             underlying mappings are permanent (e.g. vmap() or
40  *             map_domain_page_global()); it is not safe to stash
41  *             transient mappings such as those from map_domain_page().
42  *
43  *             Dom heap pages are often tied to a particular domain,
44  *             but need not be (passing domain==NULL results in an
45  *             anonymous dom heap allocation).
46  *
47  * The exact nature of this split is a (sub)arch decision which can
48  * select one of three main variants:
49  *
50  * CONFIG_SEPARATE_XENHEAP=y
51  *
52  *   The xen heap is maintained as an entirely separate heap.
53  *
54  *   Arch code arranges for some (perhaps small) amount of physical
55  *   memory to be covered by a direct mapping and registers that
56  *   memory as the Xen heap (via init_xenheap_pages()) and the
57  *   remainder as the dom heap.
58  *
59  *   This mode of operation is most commonly used by 32-bit arches
60  *   where the virtual address space is insufficient to map all RAM.
61  *
62  * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
63  *
64  *   All of RAM is covered by a permanent contiguous mapping and there
65  *   is only a single heap.
66  *
67  *   Memory allocated from the Xen heap is flagged (in
68  *   page_info.count_info) with PGC_xen_heap. Memory allocated from
69  *   the Dom heap must still be explicitly mapped before use
70  *   (e.g. with map_domain_page) in particular in common code.
71  *
72  *   xenheap_max_mfn() should not be called by arch code.
73  *
74  *   This mode of operation is most commonly used by 64-bit arches
75  *   which have sufficient free virtual address space to permanently
76  *   map the largest practical amount of RAM currently expected on that
77  *   arch.
78  *
79  * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
80  *
81  *   There is a single heap, but only the beginning (up to some
82  *   threshold) is covered by a permanent contiguous mapping.
83  *
84  *   Memory allocated from the Xen heap is allocated from below the
85  *   threshold and flagged with PGC_xen_heap. Memory allocated from
86  *   the dom heap is allocated from anywhere in the heap (although it
87  *   will prefer to allocate from as high as possible to try and keep
88  *   Xen heap suitable memory available).
89  *
90  *   Arch code must call xenheap_max_mfn() to signal the limit of the
91  *   direct mapping.
92  *
93  *   This mode of operation is most commonly used by 64-bit arches
94  *   which have a restricted amount of virtual address space available
95  *   for a direct map (due to e.g. reservations for other purposes)
96  *   such that it is not possible to map all of RAM on systems with
97  *   the largest practical amount of RAM currently expected on that
98  *   arch.
99  *
100  * Boot Allocator
101  *
102  *   In addition to the two primary pools (xen heap and dom heap) a
103  *   third "boot allocator" is used at start of day. This is a
104  *   simplified allocator which can be used before the main heaps are ready.
105  *
106  *   Typically all memory which is destined to be dom heap memory
107  *   (which is everything in the CONFIG_SEPARATE_XENHEAP=n
108  *   configurations) is first allocated to the boot allocator (with
109  *   init_boot_pages()) and is then handed over to the main dom heap in
110  *   end_boot_allocator().
111  *
112  * "Contiguous" mappings
113  *
114  *   Note that although the above talks about "contiguous" mappings
115  *   some architectures implement a scheme ("PDX compression") to
116  *   compress unused portions of the machine address space (i.e. large
117  *   gaps between distinct banks of memory) in order to avoid creating
118  *   enormous frame tables and direct maps which mostly map
119  *   nothing. Thus a contiguous mapping may still have distinct
120  *   regions within it.
121  */
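
/*
 * Illustrative sketch (not compiled) of the practical difference between the
 * two pools, assuming the usual Xen interfaces alloc_xenheap_pages(),
 * alloc_domheap_pages(), __map_domain_page() and unmap_domain_page():
 *
 *   void *v = alloc_xenheap_pages(0, 0);        // always mapped
 *   ...use v directly, pointers may be stashed...
 *   free_xenheap_pages(v, 0);
 *
 *   struct page_info *pg = alloc_domheap_pages(d, 0, 0); // may be unmapped
 *   void *p = __map_domain_page(pg);            // transient mapping
 *   ...use p, but do not stash it...
 *   unmap_domain_page(p);
 *   free_domheap_pages(pg, 0);
 */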
122 
123 #include <xen/init.h>
124 #include <xen/types.h>
125 #include <xen/lib.h>
126 #include <xen/sched.h>
127 #include <xen/spinlock.h>
128 #include <xen/mm.h>
129 #include <xen/param.h>
130 #include <xen/irq.h>
131 #include <xen/softirq.h>
132 #include <xen/domain_page.h>
133 #include <xen/keyhandler.h>
134 #include <xen/perfc.h>
135 #include <xen/pfn.h>
136 #include <xen/numa.h>
137 #include <xen/nodemask.h>
138 #include <xen/event.h>
139 #include <public/sysctl.h>
140 #include <public/sched.h>
141 #include <asm/page.h>
142 #include <asm/numa.h>
143 #include <asm/flushtlb.h>
144 #ifdef CONFIG_X86
145 #include <asm/guest.h>
146 #include <asm/p2m.h>
147 #include <asm/setup.h> /* for highmem_start only */
148 #include <asm/paging.h>
149 #else
150 #define p2m_pod_offline_or_broken_hit(pg) 0
151 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
152 #endif
153 
154 /*
155  * Comma-separated list of hexadecimal page numbers containing bad bytes.
156  * e.g. 'badpage=0x3f45,0x8a321'.
157  */
158 static char __initdata opt_badpage[100] = "";
159 string_param("badpage", opt_badpage);
160 
161 /*
162  * no-bootscrub -> Free pages are not zeroed during boot.
163  */
164 enum bootscrub_mode {
165     BOOTSCRUB_OFF,
166     BOOTSCRUB_ON,
167     BOOTSCRUB_IDLE,
168 };
169 
170 /*
171  * opt_bootscrub should live in the init section, since it's not accessed
172  * afterwards. However at least LLVM assumes there are no side effects of
173  * accessing the variable, and optimizes the condition in init_heap_pages() so
174  * opt_bootscrub is read regardless of the value of system_state:
175  * https://bugs.llvm.org/show_bug.cgi?id=39707
176  */
177 static enum bootscrub_mode __read_mostly opt_bootscrub = BOOTSCRUB_IDLE;
178 static int __init parse_bootscrub_param(const char *s)
179 {
180     /* Interpret 'bootscrub' alone in its positive boolean form */
181     if ( *s == '\0' )
182     {
183         opt_bootscrub = BOOTSCRUB_ON;
184         return 0;
185     }
186 
187     switch ( parse_bool(s, NULL) )
188     {
189     case 0:
190         opt_bootscrub = BOOTSCRUB_OFF;
191         break;
192 
193     case 1:
194         opt_bootscrub = BOOTSCRUB_ON;
195         break;
196 
197     default:
198         if ( !strcmp(s, "idle") )
199             opt_bootscrub = BOOTSCRUB_IDLE;
200         else
201             return -EINVAL;
202         break;
203     }
204 
205     return 0;
206 }
207 custom_param("bootscrub", parse_bootscrub_param);
208 
209 /*
210  * bootscrub_chunk -> Number of bytes to scrub in lockstep on non-SMT CPUs
211  * on all NUMA nodes.
212  */
213 static unsigned long __initdata opt_bootscrub_chunk = MB(128);
214 size_param("bootscrub_chunk", opt_bootscrub_chunk);
215 
216  /* scrub-domheap -> Domheap pages are scrubbed when freed */
217 static bool __read_mostly opt_scrub_domheap;
218 boolean_param("scrub-domheap", opt_scrub_domheap);
219 
220 #ifdef CONFIG_SCRUB_DEBUG
221 static bool __read_mostly scrub_debug;
222 #else
223 #define scrub_debug    false
224 #endif
225 
226 /*
227  * Bit width of the DMA heap -- used to override NUMA-node-first
228  * allocation strategy, which can otherwise exhaust low memory.
229  */
230 static unsigned int dma_bitsize;
231 integer_param("dma_bits", dma_bitsize);
232 
233 /* Offlined page list, protected by heap_lock. */
234 PAGE_LIST_HEAD(page_offlined_list);
235 /* Broken page list, protected by heap_lock. */
236 PAGE_LIST_HEAD(page_broken_list);
237 
238 /*************************
239  * BOOT-TIME ALLOCATOR
240  */
241 
242 /*
243  * first_valid_mfn is exported because it is used in ARM specific NUMA
244  * helpers. See comment in asm-arm/numa.h.
245  */
246 mfn_t first_valid_mfn = INVALID_MFN_INITIALIZER;
247 
248 struct bootmem_region {
249     unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
250 };
251 /* Statically allocate a page for bootmem_region_list. */
252 static struct bootmem_region __initdata
253     bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)];
254 static unsigned int __initdata nr_bootmem_regions;
255 
256 struct scrub_region {
257     unsigned long offset;
258     unsigned long start;
259     unsigned long per_cpu_sz;
260     unsigned long rem;
261     cpumask_t cpus;
262 };
263 static struct scrub_region __initdata region[MAX_NUMNODES];
264 static unsigned long __initdata chunk_size;
265 
266 static void __init bootmem_region_add(unsigned long s, unsigned long e)
267 {
268     unsigned int i;
269 
270     if ( s >= e )
271         return;
272 
273     for ( i = 0; i < nr_bootmem_regions; i++ )
274         if ( s < bootmem_region_list[i].e )
275             break;
276 
277     BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
278     BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region)));
279 
280     memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
281             (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
282     bootmem_region_list[i] = (struct bootmem_region) { s, e };
283     nr_bootmem_regions++;
284 }
285 
286 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
287 {
288     unsigned int i;
289 
290     for ( i = 0; i < nr_bootmem_regions; i++ )
291     {
292         struct bootmem_region *r = &bootmem_region_list[i];
293         if ( e <= r->s )
294             break;
295         if ( s >= r->e )
296             continue;
297         if ( s <= r->s )
298         {
299             r->s = min(e, r->e);
300         }
301         else if ( e >= r->e )
302         {
303             r->e = s;
304         }
305         else
306         {
307             unsigned long _e = r->e;
308             r->e = s;
309             bootmem_region_add(e, _e);
310         }
311     }
312 }
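
/*
 * Worked example (sketch): with bootmem_region_list holding [10, 20) and
 * [30, 40), bootmem_region_zap(15, 35) trims the first region to [10, 15)
 * and the second to [35, 40); zapping a range strictly inside a single
 * region would instead split it in two via bootmem_region_add().
 */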
313 
314 void __init init_boot_pages(paddr_t ps, paddr_t pe)
315 {
316     unsigned long bad_spfn, bad_epfn;
317     const char *p;
318 #ifdef CONFIG_X86
319     const struct platform_bad_page *badpage;
320     unsigned int i, array_size;
321 
322     BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
323                  MAX_ORDER + 1);
324 #endif
325     BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));
326 
327     ps = round_pgup(ps);
328     pe = round_pgdown(pe);
329     if ( pe <= ps )
330         return;
331 
332     first_valid_mfn = mfn_min(maddr_to_mfn(ps), first_valid_mfn);
333 
334     bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
335 
336 #ifdef CONFIG_X86
337     /*
338      * Here we put platform-specific memory range workarounds, i.e.
339      * memory known to be corrupt or otherwise in need of being reserved
340      * on specific platforms.
341      * We take those pages and remove them from the memory region list.
342      */
343     badpage = get_platform_badpages(&array_size);
344     if ( badpage )
345     {
346         for ( i = 0; i < array_size; i++ )
347         {
348             bootmem_region_zap(badpage->mfn,
349                                badpage->mfn + (1UL << badpage->order));
350             badpage++;
351         }
352     }
353 
354     if ( pv_shim )
355     {
356         badpage = pv_shim_reserved_pages(&array_size);
357         if ( badpage )
358         {
359             for ( i = 0; i < array_size; i++ )
360             {
361                 bootmem_region_zap(badpage->mfn,
362                                    badpage->mfn + (1UL << badpage->order));
363                 badpage++;
364             }
365         }
366     }
367 #endif
368 
369     /* Check new pages against the bad-page list. */
370     p = opt_badpage;
371     while ( *p != '\0' )
372     {
373         bad_spfn = simple_strtoul(p, &p, 0);
374         bad_epfn = bad_spfn;
375 
376         if ( *p == '-' )
377         {
378             p++;
379             bad_epfn = simple_strtoul(p, &p, 0);
380             if ( bad_epfn < bad_spfn )
381                 bad_epfn = bad_spfn;
382         }
383 
384         if ( *p == ',' )
385             p++;
386         else if ( *p != '\0' )
387             break;
388 
389         bootmem_region_zap(bad_spfn, bad_epfn+1);
390     }
391 }
392 
393 mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
394 {
395     unsigned long pg, _e;
396     unsigned int i = nr_bootmem_regions;
397 
398     BUG_ON(!nr_bootmem_regions);
399 
400     while ( i-- )
401     {
402         struct bootmem_region *r = &bootmem_region_list[i];
403 
404         pg = (r->e - nr_pfns) & ~(pfn_align - 1);
405         if ( pg >= r->e || pg < r->s )
406             continue;
407 
408 #if defined(CONFIG_X86) && !defined(NDEBUG)
409         /*
410          * Filtering pfn_align == 1 since the only allocations using a bigger
411          * alignment are the ones used for setting up the frame table chunks.
412          * Those allocations get remapped anyway, i.e. their not having 1:1
413          * mappings always accessible is not a problem.
414          */
415         if ( highmem_start && pfn_align == 1 &&
416              r->e > PFN_DOWN(highmem_start) )
417         {
418             pg = r->s;
419             if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
420                 continue;
421             r->s = pg + nr_pfns;
422             return _mfn(pg);
423         }
424 #endif
425 
426         _e = r->e;
427         r->e = pg;
428         bootmem_region_add(pg + nr_pfns, _e);
429         return _mfn(pg);
430     }
431 
432     BUG();
433 }
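
/*
 * Typical start-of-day flow (sketch; the real callers live in arch setup
 * code and may differ in detail):
 *
 *   init_boot_pages(ram_start, ram_end); // register free RAM with the
 *                                        // boot allocator
 *   mfn = alloc_boot_pages(1, 1);        // one page, no special alignment
 *   ...
 *   end_boot_allocator();                // hand remaining pages to the
 *                                        // main heaps
 */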
434 
435 
436 
437 /*************************
438  * BINARY BUDDY ALLOCATOR
439  */
440 
441 #define MEMZONE_XEN 0
442 #define NR_ZONES    (PADDR_BITS - PAGE_SHIFT + 1)
443 
444 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
445 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
446                           (flsl(mfn_x(page_to_mfn(pg))) ? : 1))
447 
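/*
 * Worked example: MFN 0x12345 (machine address 0x12345000 with 4K pages) has
 * flsl(0x12345) == 17, so page_to_zone() places it in zone 17, which holds
 * all non-xenheap pages with machine addresses in [256MiB, 512MiB).  Zone 0
 * (MEMZONE_XEN) is reserved for Xen heap pages.
 */
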
448 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
449 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
450 #define heap(node, zone, order) ((*_heap[node])[zone][order])
451 
452 static unsigned long node_need_scrub[MAX_NUMNODES];
453 
454 static unsigned long *avail[MAX_NUMNODES];
455 static long total_avail_pages;
456 
457 static DEFINE_SPINLOCK(heap_lock);
458 static long outstanding_claims; /* total outstanding claims by all domains */
459 
460 unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
461 {
462     long dom_before, dom_after, dom_claimed, sys_before, sys_after;
463 
464     ASSERT(spin_is_locked(&d->page_alloc_lock));
465     d->tot_pages += pages;
466 
467     /*
468      * We can test d->outstanding_pages race-free because it can only
469      * change while both d->page_alloc_lock and heap_lock are held; see
470      * also domain_set_outstanding_pages() below.
471      */
472     if ( !d->outstanding_pages )
473         goto out;
474 
475     spin_lock(&heap_lock);
476     /* adjust domain outstanding pages; may not go negative */
477     dom_before = d->outstanding_pages;
478     dom_after = dom_before - pages;
479     BUG_ON(dom_before < 0);
480     dom_claimed = dom_after < 0 ? 0 : dom_after;
481     d->outstanding_pages = dom_claimed;
482     /* flag accounting bug if system outstanding_claims would go negative */
483     sys_before = outstanding_claims;
484     sys_after = sys_before - (dom_before - dom_claimed);
485     BUG_ON(sys_after < 0);
486     outstanding_claims = sys_after;
487     spin_unlock(&heap_lock);
488 
489 out:
490     return d->tot_pages;
491 }
492 
493 int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
494 {
495     int ret = -ENOMEM;
496     unsigned long claim, avail_pages;
497 
498     /*
499      * Take the domain's page_alloc_lock; otherwise every d->tot_pages
500      * adjustment would have to take the global heap_lock rather than only
501      * in the much rarer case that d->outstanding_pages is non-zero.
502      */
503     spin_lock(&d->page_alloc_lock);
504     spin_lock(&heap_lock);
505 
506     /* pages==0 means "unset" the claim. */
507     if ( pages == 0 )
508     {
509         outstanding_claims -= d->outstanding_pages;
510         d->outstanding_pages = 0;
511         ret = 0;
512         goto out;
513     }
514 
515     /* only one active claim per domain please */
516     if ( d->outstanding_pages )
517     {
518         ret = -EINVAL;
519         goto out;
520     }
521 
522     /* disallow a claim not exceeding domain_tot_pages() or above max_pages */
523     if ( (pages <= domain_tot_pages(d)) || (pages > d->max_pages) )
524     {
525         ret = -EINVAL;
526         goto out;
527     }
528 
529     /* how much memory is available? */
530     avail_pages = total_avail_pages;
531 
532     avail_pages -= outstanding_claims;
533 
534     /*
535      * Note, if domain has already allocated memory before making a claim
536      * then the claim must take domain_tot_pages() into account
537      */
538     claim = pages - domain_tot_pages(d);
539     if ( claim > avail_pages )
540         goto out;
541 
542     /* yay, claim fits in available memory, stake the claim, success! */
543     d->outstanding_pages = claim;
544     outstanding_claims += d->outstanding_pages;
545     ret = 0;
546 
547 out:
548     spin_unlock(&heap_lock);
549     spin_unlock(&d->page_alloc_lock);
550     return ret;
551 }
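
/*
 * Claim usage sketch (assuming 4K pages; the real caller is the
 * XENMEM_claim_pages hypercall path):
 *
 *   // Reserve room for a 1GiB domain up front...
 *   domain_set_outstanding_pages(d, 1UL << (30 - PAGE_SHIFT));
 *   // ...allocate; domain_adjust_tot_pages() consumes the claim as pages
 *   // are assigned to d...
 *   // ...then drop whatever is left of the claim.
 *   domain_set_outstanding_pages(d, 0);
 */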
552 
553 void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
554 {
555     spin_lock(&heap_lock);
556     *outstanding_pages = outstanding_claims;
557     *free_pages =  avail_domheap_pages();
558     spin_unlock(&heap_lock);
559 }
560 
561 static bool __read_mostly first_node_initialised;
562 #ifndef CONFIG_SEPARATE_XENHEAP
563 static unsigned int __read_mostly xenheap_bits;
564 #else
565 #define xenheap_bits 0
566 #endif
567 
568 static unsigned long init_node_heap(int node, unsigned long mfn,
569                                     unsigned long nr, bool *use_tail)
570 {
571     /* First node to be discovered has its heap metadata statically allocated. */
572     static heap_by_zone_and_order_t _heap_static;
573     static unsigned long avail_static[NR_ZONES];
574     unsigned long needed = (sizeof(**_heap) +
575                             sizeof(**avail) * NR_ZONES +
576                             PAGE_SIZE - 1) >> PAGE_SHIFT;
577     int i, j;
578 
579     if ( !first_node_initialised )
580     {
581         _heap[node] = &_heap_static;
582         avail[node] = avail_static;
583         first_node_initialised = true;
584         needed = 0;
585     }
586     else if ( *use_tail && nr >= needed &&
587               arch_mfn_in_directmap(mfn + nr) &&
588               (!xenheap_bits ||
589                !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
590     {
591         _heap[node] = mfn_to_virt(mfn + nr - needed);
592         avail[node] = mfn_to_virt(mfn + nr - 1) +
593                       PAGE_SIZE - sizeof(**avail) * NR_ZONES;
594     }
595     else if ( nr >= needed &&
596               arch_mfn_in_directmap(mfn + needed) &&
597               (!xenheap_bits ||
598                !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
599     {
600         _heap[node] = mfn_to_virt(mfn);
601         avail[node] = mfn_to_virt(mfn + needed - 1) +
602                       PAGE_SIZE - sizeof(**avail) * NR_ZONES;
603         *use_tail = false;
604     }
605     else if ( get_order_from_bytes(sizeof(**_heap)) ==
606               get_order_from_pages(needed) )
607     {
608         _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
609         BUG_ON(!_heap[node]);
610         avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
611                       sizeof(**avail) * NR_ZONES;
612         needed = 0;
613     }
614     else
615     {
616         _heap[node] = xmalloc(heap_by_zone_and_order_t);
617         avail[node] = xmalloc_array(unsigned long, NR_ZONES);
618         BUG_ON(!_heap[node] || !avail[node]);
619         needed = 0;
620     }
621 
622     memset(avail[node], 0, NR_ZONES * sizeof(long));
623 
624     for ( i = 0; i < NR_ZONES; i++ )
625         for ( j = 0; j <= MAX_ORDER; j++ )
626             INIT_PAGE_LIST_HEAD(&heap(node, i, j));
627 
628     return needed;
629 }
630 
631 /* Default to 64 MiB */
632 #define DEFAULT_LOW_MEM_VIRQ    (((paddr_t) 64)   << 20)
633 #define MAX_LOW_MEM_VIRQ        (((paddr_t) 1024) << 20)
634 
635 static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
636 size_param("low_mem_virq_limit", opt_low_mem_virq);
637 
638 /* Thresholds to control hysteresis. In pages */
639 /* When memory grows above this threshold, reset hysteresis.
640  * -1 initially so we don't reset until at least one virq has been issued. */
641 static unsigned long low_mem_virq_high      = -1UL;
642 /* Threshold at which we issue virq */
643 static unsigned long low_mem_virq_th        = 0;
644 /* Original threshold after all checks completed */
645 static unsigned long low_mem_virq_orig      = 0;
646 /* Order for current threshold */
647 static unsigned int  low_mem_virq_th_order  = 0;
648 
649 /* Perform bootstrapping checks and set bounds */
650 static void __init setup_low_mem_virq(void)
651 {
652     unsigned int order;
653     paddr_t threshold;
654     bool halve;
655 
656     /* If the user specifies zero, then he/she doesn't want this virq
657      * to ever trigger. */
658     if ( opt_low_mem_virq == 0 )
659     {
660         low_mem_virq_th = -1UL;
661         return;
662     }
663 
664     /* If the user did not specify a knob, remember that */
665     halve = (opt_low_mem_virq == ((paddr_t) -1));
666     threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;
667 
668     /* Dom0 has already been allocated by now. So check we won't be
669      * complaining immediately with whatever's left of the heap. */
670     threshold = min(threshold,
671                     ((paddr_t) total_avail_pages) << PAGE_SHIFT);
672 
673     /* Then, cap to some predefined maximum */
674     threshold = min(threshold, MAX_LOW_MEM_VIRQ);
675 
676     /* If the user specified no knob, and we are at the current available
677      * level, halve the threshold. */
678     if ( halve &&
679          (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
680         threshold >>= 1;
681 
682     /* Zero? Have to fire immediately */
683     threshold = max(threshold, (paddr_t) PAGE_SIZE);
684 
685     /* Threshold bytes -> pages */
686     low_mem_virq_th = threshold >> PAGE_SHIFT;
687 
688     /* Next, round the threshold down to the next order */
689     order = get_order_from_pages(low_mem_virq_th);
690     if ( (1UL << order) > low_mem_virq_th )
691         order--;
692 
693     /* Set bounds, ready to go */
694     low_mem_virq_th = low_mem_virq_orig = 1UL << order;
695     low_mem_virq_th_order = order;
696 
697     printk("Initial low memory virq threshold set at %#lx pages.\n",
698             low_mem_virq_th);
699 }
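
/*
 * Worked example: with the default 64MiB threshold and 4K pages this yields
 * 16384 pages; get_order_from_pages(16384) == 14 and 1UL << 14 == 16384, so
 * low_mem_virq_th starts out at exactly 16384 pages (order 14).
 */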
700 
701 static void check_low_mem_virq(void)
702 {
703     unsigned long avail_pages = total_avail_pages - outstanding_claims;
704 
705     if ( unlikely(avail_pages <= low_mem_virq_th) )
706     {
707         send_global_virq(VIRQ_ENOMEM);
708 
709         /* Update thresholds. Next warning will be when we drop below
710          * next order. However, we wait until we grow beyond one
711          * order above us to complain again at the current order */
712         low_mem_virq_high   = 1UL << (low_mem_virq_th_order + 1);
713         if ( low_mem_virq_th_order > 0 )
714             low_mem_virq_th_order--;
715         low_mem_virq_th     = 1UL << low_mem_virq_th_order;
716         return;
717     }
718 
719     if ( unlikely(avail_pages >= low_mem_virq_high) )
720     {
721         /* Reset hysteresis. Bring threshold up one order.
722          * If we are back where originally set, set high
723          * threshold to -1 to avoid further growth of
724          * virq threshold. */
725         low_mem_virq_th_order++;
726         low_mem_virq_th = 1UL << low_mem_virq_th_order;
727         if ( low_mem_virq_th == low_mem_virq_orig )
728             low_mem_virq_high = -1UL;
729         else
730             low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
731     }
732 }
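
/*
 * Hysteresis example: starting at order 14 (16384 pages), dropping to 16384
 * free pages fires VIRQ_ENOMEM, sets low_mem_virq_high to 32768 and lowers
 * the threshold to 8192 pages.  Climbing back above 32768 pages raises the
 * threshold to 16384 again and, since that is the original value, disables
 * further growth by setting low_mem_virq_high to -1.
 */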
733 
734 /* Pages that need a scrub are added to tail, otherwise to head. */
735 static void page_list_add_scrub(struct page_info *pg, unsigned int node,
736                                 unsigned int zone, unsigned int order,
737                                 unsigned int first_dirty)
738 {
739     PFN_ORDER(pg) = order;
740     pg->u.free.first_dirty = first_dirty;
741     pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
742 
743     if ( first_dirty != INVALID_DIRTY_IDX )
744     {
745         ASSERT(first_dirty < (1U << order));
746         page_list_add_tail(pg, &heap(node, zone, order));
747     }
748     else
749         page_list_add(pg, &heap(node, zone, order));
750 }
751 
752 /* SCRUB_PATTERN needs to be a repeating series of bytes. */
753 #ifndef NDEBUG
754 #define SCRUB_PATTERN        0xc2c2c2c2c2c2c2c2ULL
755 #else
756 #define SCRUB_PATTERN        0ULL
757 #endif
758 #define SCRUB_BYTE_PATTERN   (SCRUB_PATTERN & 0xff)
759 
760 static void poison_one_page(struct page_info *pg)
761 {
762 #ifdef CONFIG_SCRUB_DEBUG
763     uint64_t *ptr;
764 
765     if ( !scrub_debug )
766         return;
767 
768     ptr = __map_domain_page(pg);
769     *ptr = ~SCRUB_PATTERN;
770     unmap_domain_page(ptr);
771 #endif
772 }
773 
774 static void check_one_page(struct page_info *pg)
775 {
776 #ifdef CONFIG_SCRUB_DEBUG
777     const uint64_t *ptr;
778     unsigned int i;
779 
780     if ( !scrub_debug )
781         return;
782 
783     ptr = __map_domain_page(pg);
784     for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ )
785         BUG_ON(ptr[i] != SCRUB_PATTERN);
786     unmap_domain_page(ptr);
787 #endif
788 }
789 
790 static void check_and_stop_scrub(struct page_info *head)
791 {
792     if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
793     {
794         typeof(head->u.free) pgfree;
795 
796         head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
797         spin_lock_kick();
798         for ( ; ; )
799         {
800             /* Can't ACCESS_ONCE() a bitfield. */
801             pgfree.val = ACCESS_ONCE(head->u.free.val);
802             if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT )
803                 break;
804             cpu_relax();
805         }
806     }
807 }
808 
809 static struct page_info *get_free_buddy(unsigned int zone_lo,
810                                         unsigned int zone_hi,
811                                         unsigned int order, unsigned int memflags,
812                                         const struct domain *d)
813 {
814     nodeid_t first, node = MEMF_get_node(memflags), req_node = node;
815     nodemask_t nodemask = node_online_map;
816     unsigned int j, zone, nodemask_retry = 0;
817     struct page_info *pg;
818     bool use_unscrubbed = (memflags & MEMF_no_scrub);
819 
820     /*
821      * d->node_affinity is our preferred allocation set if provided, but it
822      * may have bits set outside of node_online_map.  Clamp it.
823      */
824     if ( d )
825     {
826         /*
827          * It is the caller's responsibility to ensure that d->node_affinity
828          * isn't complete junk.
829          */
830         if ( nodes_intersects(nodemask, d->node_affinity) )
831             nodes_and(nodemask, nodemask, d->node_affinity);
832         else
833             ASSERT_UNREACHABLE();
834     }
835 
836     if ( node == NUMA_NO_NODE )
837     {
838         if ( d != NULL )
839             node = cycle_node(d->last_alloc_node, nodemask);
840 
841         if ( node >= MAX_NUMNODES )
842             node = cpu_to_node(smp_processor_id());
843     }
844     else if ( unlikely(node >= MAX_NUMNODES) )
845     {
846         ASSERT_UNREACHABLE();
847         return NULL;
848     }
849     first = node;
850 
851     /*
852      * Start with the requested node, but exhaust all of its memory in the
853      * requested zone before failing.  Only calculate a new node if we fail
854      * to find memory in the target node; this avoids needless work on the fast path.
855      */
856     for ( ; ; )
857     {
858         zone = zone_hi;
859         do {
860             /* Check if target node can support the allocation. */
861             if ( !avail[node] || (avail[node][zone] < (1UL << order)) )
862                 continue;
863 
864             /* Find smallest order which can satisfy the request. */
865             for ( j = order; j <= MAX_ORDER; j++ )
866             {
867                 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
868                 {
869                     if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
870                         return pg;
871                     /*
872                      * We grab single pages (order=0) even if they are
873                      * unscrubbed. Given that scrubbing one page is fairly quick
874                      * it is not worth breaking higher orders.
875                      */
876                     if ( (order == 0) || use_unscrubbed )
877                     {
878                         check_and_stop_scrub(pg);
879                         return pg;
880                     }
881 
882                     page_list_add_tail(pg, &heap(node, zone, j));
883                 }
884             }
885         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
886 
887         if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
888             return NULL;
889 
890         /* Pick next node. */
891         if ( !nodemask_test(node, &nodemask) )
892         {
893             /* Very first node may be caller-specified and outside nodemask. */
894             ASSERT(!nodemask_retry);
895             first = node = first_node(nodemask);
896             if ( node < MAX_NUMNODES )
897                 continue;
898         }
899         else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
900             node = first_node(nodemask);
901         if ( node == first )
902         {
903             /* When we have tried all in nodemask, we fall back to others. */
904             if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
905                 return NULL;
906             nodes_andnot(nodemask, node_online_map, nodemask);
907             first = node = first_node(nodemask);
908             if ( node >= MAX_NUMNODES )
909                 return NULL;
910         }
911     }
912 }
913 
914 /* Allocate 2^@order contiguous pages. */
915 static struct page_info *alloc_heap_pages(
916     unsigned int zone_lo, unsigned int zone_hi,
917     unsigned int order, unsigned int memflags,
918     struct domain *d)
919 {
920     nodeid_t node;
921     unsigned int i, buddy_order, zone, first_dirty;
922     unsigned long request = 1UL << order;
923     struct page_info *pg;
924     bool need_tlbflush = false;
925     uint32_t tlbflush_timestamp = 0;
926     unsigned int dirty_cnt = 0;
927 
928     /* Make sure there are enough bits in memflags for nodeID. */
929     BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
930 
931     ASSERT(zone_lo <= zone_hi);
932     ASSERT(zone_hi < NR_ZONES);
933 
934     if ( unlikely(order > MAX_ORDER) )
935         return NULL;
936 
937     spin_lock(&heap_lock);
938 
939     /*
940      * Claimed memory is considered unavailable unless the request
941      * is made by a domain with sufficient unclaimed pages.
942      */
943     if ( (outstanding_claims + request > total_avail_pages) &&
944           ((memflags & MEMF_no_refcount) ||
945            !d || d->outstanding_pages < request) )
946     {
947         spin_unlock(&heap_lock);
948         return NULL;
949     }
950 
951     pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d);
952     /* Try getting a dirty buddy if we couldn't get a clean one. */
953     if ( !pg && !(memflags & MEMF_no_scrub) )
954         pg = get_free_buddy(zone_lo, zone_hi, order,
955                             memflags | MEMF_no_scrub, d);
956     if ( !pg )
957     {
958         /* No suitable memory blocks. Fail the request. */
959         spin_unlock(&heap_lock);
960         return NULL;
961     }
962 
963     node = phys_to_nid(page_to_maddr(pg));
964     zone = page_to_zone(pg);
965     buddy_order = PFN_ORDER(pg);
966 
967     first_dirty = pg->u.free.first_dirty;
968 
969     /* We may have to halve the chunk a number of times. */
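    /*
     * E.g. an order-1 request satisfied from an order-3 buddy is split twice:
     * the lower order-2 and order-1 halves go back onto the free lists (via
     * page_list_add_scrub()) and pg advances each time, ending up at the top
     * order-1 chunk, which is what gets returned.
     */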
970     while ( buddy_order != order )
971     {
972         buddy_order--;
973         page_list_add_scrub(pg, node, zone, buddy_order,
974                             (1U << buddy_order) > first_dirty ?
975                             first_dirty : INVALID_DIRTY_IDX);
976         pg += 1U << buddy_order;
977 
978         if ( first_dirty != INVALID_DIRTY_IDX )
979         {
980             /* Adjust first_dirty */
981             if ( first_dirty >= 1U << buddy_order )
982                 first_dirty -= 1U << buddy_order;
983             else
984                 first_dirty = 0; /* We've moved past original first_dirty */
985         }
986     }
987 
988     ASSERT(avail[node][zone] >= request);
989     avail[node][zone] -= request;
990     total_avail_pages -= request;
991     ASSERT(total_avail_pages >= 0);
992 
993     check_low_mem_virq();
994 
995     if ( d != NULL )
996         d->last_alloc_node = node;
997 
998     for ( i = 0; i < (1 << order); i++ )
999     {
1000         /* Reference count must continuously be zero for free pages. */
1001         if ( (pg[i].count_info & ~PGC_need_scrub) != PGC_state_free )
1002         {
1003             printk(XENLOG_ERR
1004                    "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
1005                    i, mfn_x(page_to_mfn(pg + i)),
1006                    pg[i].count_info, pg[i].v.free.order,
1007                    pg[i].u.free.val, pg[i].tlbflush_timestamp);
1008             BUG();
1009         }
1010 
1011         /* PGC_need_scrub can only be set if first_dirty is valid */
1012         ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub));
1013 
1014         /* Preserve PGC_need_scrub so we can check it after lock is dropped. */
1015         pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub);
1016 
1017         if ( !(memflags & MEMF_no_tlbflush) )
1018             accumulate_tlbflush(&need_tlbflush, &pg[i],
1019                                 &tlbflush_timestamp);
1020 
1021         /* Initialise fields which have other uses for free pages. */
1022         pg[i].u.inuse.type_info = 0;
1023         page_set_owner(&pg[i], NULL);
1024 
1025         /* Ensure cache and RAM are consistent for platforms where the
1026          * guest can control its own visibility of/through the cache.
1027          */
1028         flush_page_to_ram(mfn_x(page_to_mfn(&pg[i])),
1029                           !(memflags & MEMF_no_icache_flush));
1030     }
1031 
1032     spin_unlock(&heap_lock);
1033 
1034     if ( first_dirty != INVALID_DIRTY_IDX ||
1035          (scrub_debug && !(memflags & MEMF_no_scrub)) )
1036     {
1037         for ( i = 0; i < (1U << order); i++ )
1038         {
1039             if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1040             {
1041                 if ( !(memflags & MEMF_no_scrub) )
1042                     scrub_one_page(&pg[i]);
1043 
1044                 dirty_cnt++;
1045 
1046                 spin_lock(&heap_lock);
1047                 pg[i].count_info &= ~PGC_need_scrub;
1048                 spin_unlock(&heap_lock);
1049             }
1050             else if ( !(memflags & MEMF_no_scrub) )
1051                 check_one_page(&pg[i]);
1052         }
1053 
1054         if ( dirty_cnt )
1055         {
1056             spin_lock(&heap_lock);
1057             node_need_scrub[node] -= dirty_cnt;
1058             spin_unlock(&heap_lock);
1059         }
1060     }
1061 
1062     if ( need_tlbflush )
1063         filtered_flush_tlb_mask(tlbflush_timestamp);
1064 
1065     return pg;
1066 }
1067 
1068 /* Remove any offlined page in the buddy pointed to by head. */
1069 static int reserve_offlined_page(struct page_info *head)
1070 {
1071     unsigned int node = phys_to_nid(page_to_maddr(head));
1072     int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
1073     struct page_info *cur_head;
1074     unsigned int cur_order, first_dirty;
1075 
1076     ASSERT(spin_is_locked(&heap_lock));
1077 
1078     cur_head = head;
1079 
1080     check_and_stop_scrub(head);
1081     /*
1082      * We may break the buddy so let's mark the head as clean. Then, when
1083      * merging chunks back into the heap, we will see whether the chunk has
1084      * unscrubbed pages and set its first_dirty properly.
1085      */
1086     first_dirty = head->u.free.first_dirty;
1087     head->u.free.first_dirty = INVALID_DIRTY_IDX;
1088 
1089     page_list_del(head, &heap(node, zone, head_order));
1090 
1091     while ( cur_head < (head + (1 << head_order)) )
1092     {
1093         struct page_info *pg;
1094         int next_order;
1095 
1096         if ( page_state_is(cur_head, offlined) )
1097         {
1098             cur_head++;
1099             if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
1100                 first_dirty--;
1101             continue;
1102         }
1103 
1104         next_order = cur_order = 0;
1105 
1106         while ( cur_order < head_order )
1107         {
1108             next_order = cur_order + 1;
1109 
1110             if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
1111                 goto merge;
1112 
1113             for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
1114                   i < (1 << next_order);
1115                   i++, pg++ )
1116                 if ( page_state_is(pg, offlined) )
1117                     break;
1118             if ( i == ( 1 << next_order) )
1119             {
1120                 cur_order = next_order;
1121                 continue;
1122             }
1123             else
1124             {
1125             merge:
1126                 /* We don't consider merging outside the head_order. */
1127                 page_list_add_scrub(cur_head, node, zone, cur_order,
1128                                     (1U << cur_order) > first_dirty ?
1129                                     first_dirty : INVALID_DIRTY_IDX);
1130                 cur_head += (1 << cur_order);
1131 
1132                 /* Adjust first_dirty if needed. */
1133                 if ( first_dirty != INVALID_DIRTY_IDX )
1134                 {
1135                     if ( first_dirty >=  1U << cur_order )
1136                         first_dirty -= 1U << cur_order;
1137                     else
1138                         first_dirty = 0;
1139                 }
1140 
1141                 break;
1142             }
1143         }
1144     }
1145 
1146     for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
1147     {
1148         if ( !page_state_is(cur_head, offlined) )
1149             continue;
1150 
1151         avail[node][zone]--;
1152         total_avail_pages--;
1153         ASSERT(total_avail_pages >= 0);
1154 
1155         page_list_add_tail(cur_head,
1156                            test_bit(_PGC_broken, &cur_head->count_info) ?
1157                            &page_broken_list : &page_offlined_list);
1158 
1159         count++;
1160     }
1161 
1162     return count;
1163 }
1164 
1165 static nodemask_t node_scrubbing;
1166 
1167 /*
1168  * If get_node is true this will return the closest node that needs to be
1169  * scrubbed, with the appropriate bit in node_scrubbing set.
1170  * If get_node is not set, this will return *a* node that needs to be
1171  * scrubbed; the node_scrubbing bitmask will not be updated.
1172  * If no node needs scrubbing then NUMA_NO_NODE is returned.
1173  */
1174 static unsigned int node_to_scrub(bool get_node)
1175 {
1176     nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
1177     nodeid_t closest = NUMA_NO_NODE;
1178     u8 dist, shortest = 0xff;
1179 
1180     if ( node == NUMA_NO_NODE )
1181         node = 0;
1182 
1183     if ( node_need_scrub[node] &&
1184          (!get_node || !node_test_and_set(node, node_scrubbing)) )
1185         return node;
1186 
1187     /*
1188      * See if there are memory-only nodes that need scrubbing and choose
1189      * the closest one.
1190      */
1191     local_node = node;
1192     for ( ; ; )
1193     {
1194         do {
1195             node = cycle_node(node, node_online_map);
1196         } while ( !cpumask_empty(&node_to_cpumask(node)) &&
1197                   (node != local_node) );
1198 
1199         if ( node == local_node )
1200             break;
1201 
1202         if ( node_need_scrub[node] )
1203         {
1204             if ( !get_node )
1205                 return node;
1206 
1207             dist = __node_distance(local_node, node);
1208 
1209             /*
1210              * Grab the node right away. If we find a closer node later we will
1211              * release this one. While another CPU searching for scrub work
1212              * at the same time may then not be able to scrub that node, it
1213              * will be able to do so the next time it wakes up.
1214              * The alternative would be to perform this search under a lock but
1215              * then we'd need to take this lock every time we come in here.
1216              */
1217             if ( (dist < shortest || closest == NUMA_NO_NODE) &&
1218                  !node_test_and_set(node, node_scrubbing) )
1219             {
1220                 if ( closest != NUMA_NO_NODE )
1221                     node_clear(closest, node_scrubbing);
1222                 shortest = dist;
1223                 closest = node;
1224             }
1225         }
1226     }
1227 
1228     return closest;
1229 }
1230 
1231 struct scrub_wait_state {
1232     struct page_info *pg;
1233     unsigned int first_dirty;
1234     bool drop;
1235 };
1236 
1237 static void scrub_continue(void *data)
1238 {
1239     struct scrub_wait_state *st = data;
1240 
1241     if ( st->drop )
1242         return;
1243 
1244     if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1245     {
1246         /* There is a waiter for this buddy. Release it. */
1247         st->drop = true;
1248         st->pg->u.free.first_dirty = st->first_dirty;
1249         smp_wmb();
1250         st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1251     }
1252 }
1253 
1254 bool scrub_free_pages(void)
1255 {
1256     struct page_info *pg;
1257     unsigned int zone;
1258     unsigned int cpu = smp_processor_id();
1259     bool preempt = false;
1260     nodeid_t node;
1261     unsigned int cnt = 0;
1262 
1263     node = node_to_scrub(true);
1264     if ( node == NUMA_NO_NODE )
1265         return false;
1266 
1267     spin_lock(&heap_lock);
1268 
1269     for ( zone = 0; zone < NR_ZONES; zone++ )
1270     {
1271         unsigned int order = MAX_ORDER;
1272 
1273         do {
1274             while ( !page_list_empty(&heap(node, zone, order)) )
1275             {
1276                 unsigned int i, dirty_cnt;
1277                 struct scrub_wait_state st;
1278 
1279                 /* Unscrubbed pages are always at the end of the list. */
1280                 pg = page_list_last(&heap(node, zone, order));
1281                 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
1282                     break;
1283 
1284                 ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING);
1285                 pg->u.free.scrub_state = BUDDY_SCRUBBING;
1286 
1287                 spin_unlock(&heap_lock);
1288 
1289                 dirty_cnt = 0;
1290 
1291                 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
1292                 {
1293                     if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1294                     {
1295                         scrub_one_page(&pg[i]);
1296                         /*
1297                          * We can modify count_info without holding heap
1298                          * lock since we effectively locked this buddy by
1299                          * setting its scrub_state.
1300                          */
1301                         pg[i].count_info &= ~PGC_need_scrub;
1302                         dirty_cnt++;
1303                         cnt += 100; /* scrubbed pages add heavier weight. */
1304                     }
1305                     else
1306                         cnt++;
1307 
1308                     if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1309                     {
1310                         /* Someone wants this chunk. Drop everything. */
1311 
1312                         pg->u.free.first_dirty = (i == (1U << order) - 1) ?
1313                             INVALID_DIRTY_IDX : i + 1;
1314                         smp_wmb();
1315                         pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1316 
1317                         spin_lock(&heap_lock);
1318                         node_need_scrub[node] -= dirty_cnt;
1319                         spin_unlock(&heap_lock);
1320                         goto out_nolock;
1321                     }
1322 
1323                     /*
1324                      * Scrub a few (8) pages before becoming eligible for
1325                      * preemption. But also count non-scrubbing loop iterations
1326                      * so that we don't get stuck here with an almost clean
1327                      * heap.
1328                      */
1329                     if ( cnt > 800 && softirq_pending(cpu) )
1330                     {
1331                         preempt = true;
1332                         break;
1333                     }
1334                 }
1335 
1336                 st.pg = pg;
1337                 /*
1338                  * get_free_buddy() grabs a buddy with first_dirty set to
1339                  * INVALID_DIRTY_IDX so we can't set pg's first_dirty here.
1340                  * It will be set either below or in the lock callback (in
1341                  * scrub_continue()).
1342                  */
1343                 st.first_dirty = (i >= (1U << order) - 1) ?
1344                     INVALID_DIRTY_IDX : i + 1;
1345                 st.drop = false;
1346                 spin_lock_cb(&heap_lock, scrub_continue, &st);
1347 
1348                 node_need_scrub[node] -= dirty_cnt;
1349 
1350                 if ( st.drop )
1351                     goto out;
1352 
1353                 if ( i >= (1U << order) - 1 )
1354                 {
1355                     page_list_del(pg, &heap(node, zone, order));
1356                     page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
1357                 }
1358                 else
1359                     pg->u.free.first_dirty = i + 1;
1360 
1361                 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1362 
1363                 if ( preempt || (node_need_scrub[node] == 0) )
1364                     goto out;
1365             }
1366         } while ( order-- != 0 );
1367     }
1368 
1369  out:
1370     spin_unlock(&heap_lock);
1371 
1372  out_nolock:
1373     node_clear(node, node_scrubbing);
1374     return node_to_scrub(false) != NUMA_NO_NODE;
1375 }
1376 
1377 /* Free 2^@order set of pages. */
1378 static void free_heap_pages(
1379     struct page_info *pg, unsigned int order, bool need_scrub)
1380 {
1381     unsigned long mask;
1382     mfn_t mfn = page_to_mfn(pg);
1383     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
1384     unsigned int zone = page_to_zone(pg);
1385 
1386     ASSERT(order <= MAX_ORDER);
1387     ASSERT(node >= 0);
1388 
1389     spin_lock(&heap_lock);
1390 
1391     for ( i = 0; i < (1 << order); i++ )
1392     {
1393         /*
1394          * Cannot assume that count_info == 0, as there are some corner cases
1395          * where it isn't the case and yet it isn't a bug:
1396          *  1. page_get_owner() is NULL
1397          *  2. page_get_owner() is a domain that was never accessible by
1398          *     its domid (e.g., failed to fully construct the domain).
1399          *  3. page was never addressable by the guest (e.g., it's an
1400          *     auto-translate-physmap guest and the page was never included
1401          *     in its pseudophysical address space).
1402          * In all the above cases there can be no guest mappings of this page.
1403          */
1404         switch ( pg[i].count_info & PGC_state )
1405         {
1406         case PGC_state_inuse:
1407             BUG_ON(pg[i].count_info & PGC_broken);
1408             pg[i].count_info = PGC_state_free;
1409             break;
1410 
1411         case PGC_state_offlining:
1412             pg[i].count_info = (pg[i].count_info & PGC_broken) |
1413                                PGC_state_offlined;
1414             tainted = 1;
1415             break;
1416 
1417         default:
1418             printk(XENLOG_ERR
1419                    "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
1420                    i, mfn_x(page_to_mfn(pg + i)),
1421                    pg[i].count_info, pg[i].v.free.order,
1422                    pg[i].u.free.val, pg[i].tlbflush_timestamp);
1423             BUG();
1424         }
1425 
1426         /* If a page has no owner it will need no safety TLB flush. */
1427         pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
1428         if ( pg[i].u.free.need_tlbflush )
1429             page_set_tlbflush_timestamp(&pg[i]);
1430 
1431         /* This page is not a guest frame any more. */
1432         page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
1433         set_gpfn_from_mfn(mfn_x(mfn) + i, INVALID_M2P_ENTRY);
1434 
1435         if ( need_scrub )
1436         {
1437             pg[i].count_info |= PGC_need_scrub;
1438             poison_one_page(&pg[i]);
1439         }
1440     }
1441 
1442     avail[node][zone] += 1 << order;
1443     total_avail_pages += 1 << order;
1444     if ( need_scrub )
1445     {
1446         node_need_scrub[node] += 1 << order;
1447         pg->u.free.first_dirty = 0;
1448     }
1449     else
1450         pg->u.free.first_dirty = INVALID_DIRTY_IDX;
1451 
1452     /* Merge chunks as far as possible. */
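    /*
     * E.g. freeing an order-0 page at MFN 0x101: bit 0 is set, so try to
     * merge with the predecessor at 0x100; if that buddy is free the block
     * becomes order 1 at 0x100, bit 1 is clear, so next try the successor at
     * 0x102, and so on up to MAX_ORDER.
     */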
1453     while ( order < MAX_ORDER )
1454     {
1455         mask = 1UL << order;
1456 
1457         if ( (mfn_x(page_to_mfn(pg)) & mask) )
1458         {
1459             struct page_info *predecessor = pg - mask;
1460 
1461             /* Merge with predecessor block? */
1462             if ( !mfn_valid(page_to_mfn(predecessor)) ||
1463                  !page_state_is(predecessor, free) ||
1464                  (PFN_ORDER(predecessor) != order) ||
1465                  (phys_to_nid(page_to_maddr(predecessor)) != node) )
1466                 break;
1467 
1468             check_and_stop_scrub(predecessor);
1469 
1470             page_list_del(predecessor, &heap(node, zone, order));
1471 
1472             /* Update predecessor's first_dirty if necessary. */
1473             if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
1474                  pg->u.free.first_dirty != INVALID_DIRTY_IDX )
1475                 predecessor->u.free.first_dirty = (1U << order) +
1476                                                   pg->u.free.first_dirty;
1477 
1478             pg = predecessor;
1479         }
1480         else
1481         {
1482             struct page_info *successor = pg + mask;
1483 
1484             /* Merge with successor block? */
1485             if ( !mfn_valid(page_to_mfn(successor)) ||
1486                  !page_state_is(successor, free) ||
1487                  (PFN_ORDER(successor) != order) ||
1488                  (phys_to_nid(page_to_maddr(successor)) != node) )
1489                 break;
1490 
1491             check_and_stop_scrub(successor);
1492 
1493             /* Update pg's first_dirty if necessary. */
1494             if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
1495                  successor->u.free.first_dirty != INVALID_DIRTY_IDX )
1496                 pg->u.free.first_dirty = (1U << order) +
1497                                          successor->u.free.first_dirty;
1498 
1499             page_list_del(successor, &heap(node, zone, order));
1500         }
1501 
1502         order++;
1503     }
1504 
1505     page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
1506 
1507     if ( tainted )
1508         reserve_offlined_page(pg);
1509 
1510     spin_unlock(&heap_lock);
1511 }
1512 
1513 
1514 /*
1515  * The following rules apply to page offlining:
1516  * - Once a page is broken, it cannot be assigned any more.
1517  * - A page will be offlined only if it is free.
1518  * Returns the original count_info.
1519  */
1520 static unsigned long mark_page_offline(struct page_info *pg, int broken)
1521 {
1522     unsigned long nx, x, y = pg->count_info;
1523 
1524     ASSERT(page_is_ram_type(mfn_x(page_to_mfn(pg)), RAM_TYPE_CONVENTIONAL));
1525     ASSERT(spin_is_locked(&heap_lock));
1526 
1527     do {
1528         nx = x = y;
1529 
1530         if ( ((x & PGC_state) != PGC_state_offlined) &&
1531              ((x & PGC_state) != PGC_state_offlining) )
1532         {
1533             nx &= ~PGC_state;
1534             nx |= (((x & PGC_state) == PGC_state_free)
1535                    ? PGC_state_offlined : PGC_state_offlining);
1536         }
1537 
1538         if ( broken )
1539             nx |= PGC_broken;
1540 
1541         if ( x == nx )
1542             break;
1543     } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1544 
1545     return y;
1546 }
1547 
1548 static int reserve_heap_page(struct page_info *pg)
1549 {
1550     struct page_info *head = NULL;
1551     unsigned int i, node = phys_to_nid(page_to_maddr(pg));
1552     unsigned int zone = page_to_zone(pg);
1553 
1554     for ( i = 0; i <= MAX_ORDER; i++ )
1555     {
1556         struct page_info *tmp;
1557 
1558         if ( page_list_empty(&heap(node, zone, i)) )
1559             continue;
1560 
1561         page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
1562         {
1563             if ( (head <= pg) &&
1564                  (head + (1UL << i) > pg) )
1565                 return reserve_offlined_page(head);
1566         }
1567     }
1568 
1569     return -EINVAL;
1570 
1571 }
1572 
1573 int offline_page(mfn_t mfn, int broken, uint32_t *status)
1574 {
1575     unsigned long old_info = 0;
1576     struct domain *owner;
1577     struct page_info *pg;
1578 
1579     if ( !mfn_valid(mfn) )
1580     {
1581         dprintk(XENLOG_WARNING,
1582                 "try to offline out of range page %"PRI_mfn"\n", mfn_x(mfn));
1583         return -EINVAL;
1584     }
1585 
1586     *status = 0;
1587     pg = mfn_to_page(mfn);
1588 
1589     if ( is_xen_fixed_mfn(mfn) )
1590     {
1591         *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
1592           (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1593         return -EPERM;
1594     }
1595 
1596     /*
1597      * N.B. Xen's text region on x86_64 is marked reserved and is already
1598      * handled.  The kexec range is reserved as well.
1599      */
1600     if ( !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) )
1601     {
1602         *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
1603         return -EINVAL;
1604     }
1605 
1606     /*
1607      * NB. When a broken page belongs to a guest, the hypervisor usually
1608      * notifies the guest so that it can handle it.  However, the hypervisor
1609      * must also prevent a malicious guest from accessing the broken page
1610      * again; in that case the guest is shut down, preventing a recursive MCE.
1611      */
1612     if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
1613     {
1614         *status = PG_OFFLINE_AGAIN;
1615         domain_crash(owner);
1616         return 0;
1617     }
1618 
1619     spin_lock(&heap_lock);
1620 
1621     old_info = mark_page_offline(pg, broken);
1622 
1623     if ( page_state_is(pg, offlined) )
1624     {
1625         reserve_heap_page(pg);
1626 
1627         spin_unlock(&heap_lock);
1628 
1629         *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN
1630                          : PG_OFFLINE_OFFLINED;
1631         return 0;
1632     }
1633 
1634     spin_unlock(&heap_lock);
1635 
1636     if ( (owner = page_get_owner_and_reference(pg)) )
1637     {
1638         if ( p2m_pod_offline_or_broken_hit(pg) )
1639         {
1640             put_page(pg);
1641             p2m_pod_offline_or_broken_replace(pg);
1642             *status = PG_OFFLINE_OFFLINED;
1643         }
1644         else
1645         {
1646             *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
1647                       (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
1648             /* Release the reference since it will not be allocated anymore */
1649             put_page(pg);
1650         }
1651     }
1652     else if ( old_info & PGC_xen_heap )
1653     {
1654         *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
1655                   (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1656     }
1657     else
1658     {
1659         /*
1660          * assign_pages() does not hold heap_lock, so there is a small window
1661          * in which an owner may still be set.  Note, however, that the owner
1662          * can only change from NULL to non-NULL, not vice versa, since the
1663          * page is being offlined.  There is no window when called from the
1664          * #MC handler, since all CPUs are in softirq context.  When called
1665          * from user space (e.g. CE handling), tools can wait and call again.
1666          */
1667         *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
1668                   (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT);
1669     }
1670 
1671     if ( broken )
1672         *status |= PG_OFFLINE_BROKEN;
1673 
1674     return 0;
1675 }
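
/*
 * A minimal usage sketch for the page offline interface, e.g. from a
 * hypothetical MCE or sysctl handler (the surrounding plumbing is an
 * assumption, not code from this file):
 *
 *     uint32_t status;
 *     int rc = offline_page(mfn, 1, &status);   // broken == 1
 *
 *     if ( rc )
 *         // report the error to the caller
 *     else if ( status & PG_OFFLINE_OFFLINED )
 *         // the page is already isolated from the allocator
 *     else if ( status & PG_OFFLINE_PENDING )
 *         // the page is still allocated; it will be reserved once freed
 */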
1676 
1677 /*
1678  * Online the memory.
1679  *   The caller should make sure end_pfn <= max_page;
1680  *   if not, expand_pages() should be called prior to online_page().
1681  */
1682 int online_page(mfn_t mfn, uint32_t *status)
1683 {
1684     unsigned long x, nx, y;
1685     struct page_info *pg;
1686     int ret;
1687 
1688     if ( !mfn_valid(mfn) )
1689     {
1690         dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1691         return -EINVAL;
1692     }
1693 
1694     pg = mfn_to_page(mfn);
1695 
1696     spin_lock(&heap_lock);
1697 
1698     y = pg->count_info;
1699     do {
1700         ret = *status = 0;
1701 
1702         if ( y & PGC_broken )
1703         {
1704             ret = -EINVAL;
1705             *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
1706             break;
1707         }
1708 
1709         if ( (y & PGC_state) == PGC_state_offlined )
1710         {
1711             page_list_del(pg, &page_offlined_list);
1712             *status = PG_ONLINE_ONLINED;
1713         }
1714         else if ( (y & PGC_state) == PGC_state_offlining )
1715         {
1716             *status = PG_ONLINE_ONLINED;
1717         }
1718         else
1719         {
1720             break;
1721         }
1722 
1723         x = y;
1724         nx = (x & ~PGC_state) | PGC_state_inuse;
1725     } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1726 
1727     spin_unlock(&heap_lock);
1728 
1729     if ( (y & PGC_state) == PGC_state_offlined )
1730         free_heap_pages(pg, 0, false);
1731 
1732     return ret;
1733 }
1734 
1735 int query_page_offline(mfn_t mfn, uint32_t *status)
1736 {
1737     struct page_info *pg;
1738 
1739     if ( !mfn_valid(mfn) || !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) )
1740     {
1741         dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1742         return -EINVAL;
1743     }
1744 
1745     *status = 0;
1746     spin_lock(&heap_lock);
1747 
1748     pg = mfn_to_page(mfn);
1749 
1750     if ( page_state_is(pg, offlining) )
1751         *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
1752     if ( pg->count_info & PGC_broken )
1753         *status |= PG_OFFLINE_STATUS_BROKEN;
1754     if ( page_state_is(pg, offlined) )
1755         *status |= PG_OFFLINE_STATUS_OFFLINED;
1756 
1757     spin_unlock(&heap_lock);
1758 
1759     return 0;
1760 }
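
/*
 * Sketch of how a caller might interpret the returned status bits
 * (illustrative only; the printing is an assumption):
 *
 *     uint32_t status;
 *
 *     if ( !query_page_offline(mfn, &status) )
 *     {
 *         if ( status & PG_OFFLINE_STATUS_OFFLINED )
 *             printk("page fully offlined\n");
 *         else if ( status & PG_OFFLINE_STATUS_OFFLINE_PENDING )
 *             printk("offline pending - page still allocated\n");
 *         if ( status & PG_OFFLINE_STATUS_BROKEN )
 *             printk("page is marked broken\n");
 *     }
 */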
1761 
1762 /*
1763  * Hand the specified arbitrary page range to the specified heap zone
1764  * checking the node_id of the previous page.  If they differ and the
1765  * latter is not on a MAX_ORDER boundary, then we reserve the page by
1766  * not freeing it to the buddy allocator.
1767  */
1768 static void init_heap_pages(
1769     struct page_info *pg, unsigned long nr_pages)
1770 {
1771     unsigned long i;
1772     bool idle_scrub = false;
1773 
1774     /*
1775      * Keep MFN 0 away from the buddy allocator to avoid crossing zone
1776      * boundary when merging two buddies.
1777      */
1778     if ( !mfn_x(page_to_mfn(pg)) )
1779     {
1780         if ( nr_pages-- <= 1 )
1781             return;
1782         pg++;
1783     }
1784 
1785 
1786     /*
1787      * Some pages may not go through the boot allocator (e.g. reserved
1788      * memory at boot but released just after --- kernel, initramfs,
1789      * etc.).
1790      * Update first_valid_mfn to ensure those regions are covered.
1791      */
1792     spin_lock(&heap_lock);
1793     first_valid_mfn = mfn_min(page_to_mfn(pg), first_valid_mfn);
1794     spin_unlock(&heap_lock);
1795 
1796     if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
1797         idle_scrub = true;
1798 
1799     for ( i = 0; i < nr_pages; i++ )
1800     {
1801         unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
1802 
1803         if ( unlikely(!avail[nid]) )
1804         {
1805             unsigned long s = mfn_x(page_to_mfn(pg + i));
1806             unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
1807             bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
1808                             !(s & ((1UL << MAX_ORDER) - 1)) &&
1809                             (find_first_set_bit(e) <= find_first_set_bit(s));
1810             unsigned long n;
1811 
1812             n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
1813                                &use_tail);
1814             BUG_ON(i + n > nr_pages);
1815             if ( n && !use_tail )
1816             {
1817                 i += n - 1;
1818                 continue;
1819             }
1820             if ( i + n == nr_pages )
1821                 break;
1822             nr_pages -= n;
1823         }
1824 
1825         free_heap_pages(pg + i, 0, scrub_debug || idle_scrub);
1826     }
1827 }
1828 
1829 static unsigned long avail_heap_pages(
1830     unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
1831 {
1832     unsigned int i, zone;
1833     unsigned long free_pages = 0;
1834 
1835     if ( zone_hi >= NR_ZONES )
1836         zone_hi = NR_ZONES - 1;
1837 
1838     for_each_online_node(i)
1839     {
1840         if ( !avail[i] )
1841             continue;
1842         for ( zone = zone_lo; zone <= zone_hi; zone++ )
1843             if ( (node == -1) || (node == i) )
1844                 free_pages += avail[i][zone];
1845     }
1846 
1847     return free_pages;
1848 }
1849 
1850 void __init end_boot_allocator(void)
1851 {
1852     unsigned int i;
1853 
1854     /* Pages that are free now go to the domain sub-allocator. */
1855     for ( i = 0; i < nr_bootmem_regions; i++ )
1856     {
1857         struct bootmem_region *r = &bootmem_region_list[i];
1858         if ( (r->s < r->e) &&
1859              (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
1860         {
1861             init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
1862             r->e = r->s;
1863             break;
1864         }
1865     }
1866     for ( i = nr_bootmem_regions; i-- > 0; )
1867     {
1868         struct bootmem_region *r = &bootmem_region_list[i];
1869         if ( r->s < r->e )
1870             init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
1871     }
1872     nr_bootmem_regions = 0;
1873 
1874     if ( !dma_bitsize && (num_online_nodes() > 1) )
1875         dma_bitsize = arch_get_dma_bitsize();
1876 
1877     printk("Domain heap initialised");
1878     if ( dma_bitsize )
1879         printk(" DMA width %u bits", dma_bitsize);
1880     printk("\n");
1881 }
1882 
1883 static void __init smp_scrub_heap_pages(void *data)
1884 {
1885     unsigned long mfn, start, end;
1886     struct page_info *pg;
1887     struct scrub_region *r;
1888     unsigned int temp_cpu, cpu_idx = 0;
1889     nodeid_t node;
1890     unsigned int cpu = smp_processor_id();
1891 
1892     if ( data )
1893         r = data;
1894     else
1895     {
1896         node = cpu_to_node(cpu);
1897         if ( node == NUMA_NO_NODE )
1898             return;
1899         r = &region[node];
1900     }
1901 
1902     /* Determine the current CPU's index among the CPUs linked to this node. */
1903     for_each_cpu ( temp_cpu, &r->cpus )
1904     {
1905         if ( cpu == temp_cpu )
1906             break;
1907         cpu_idx++;
1908     }
1909 
1910     /* Calculate the starting mfn for this CPU's memory block. */
1911     start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
1912 
1913     /* Calculate the end mfn into this CPU's memory block for this iteration. */
1914     if ( r->offset + chunk_size >= r->per_cpu_sz )
1915     {
1916         end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
1917 
1918         if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) )
1919             end += r->rem;
1920     }
1921     else
1922         end = start + chunk_size;
1923 
1924     for ( mfn = start; mfn < end; mfn++ )
1925     {
1926         pg = mfn_to_page(_mfn(mfn));
1927 
1928         /* Check the mfn is valid and page is free. */
1929         if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) )
1930             continue;
1931 
1932         scrub_one_page(pg);
1933     }
1934 }
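
/*
 * Worked example of the range computation above (numbers are illustrative):
 * a node spanning MFNs [0x100000, 0x140000) with 4 worker CPUs gives
 * per_cpu_sz = 0x10000 and rem = 0.  With the default chunk_size of
 * 0x8000 pages (128MB), the CPU with index 2 scrubs:
 *
 *   iteration 1 (offset = 0):      [0x120000, 0x128000)
 *   iteration 2 (offset = 0x8000): [0x128000, 0x130000)
 *
 * On the last iteration (offset + chunk_size >= per_cpu_sz) the end is
 * clamped to the CPU's block, and the final CPU additionally takes 'rem'.
 */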
1935 
1936 static int __init find_non_smt(unsigned int node, cpumask_t *dest)
1937 {
1938     cpumask_t node_cpus;
1939     unsigned int i, cpu;
1940 
1941     cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
1942     cpumask_clear(dest);
1943     for_each_cpu ( i, &node_cpus )
1944     {
1945         if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
1946             continue;
1947         cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
1948         __cpumask_set_cpu(cpu, dest);
1949     }
1950     return cpumask_weight(dest);
1951 }
1952 
1953 /*
1954  * Scrub all unallocated pages in all heap zones. This function uses all
1955  * online cpu's to scrub the memory in parallel.
1956  */
1957 static void __init scrub_heap_pages(void)
1958 {
1959     cpumask_t node_cpus, all_worker_cpus;
1960     unsigned int i, j;
1961     unsigned long offset, max_per_cpu_sz = 0;
1962     unsigned long start, end;
1963     unsigned long rem = 0;
1964     int last_distance, best_node;
1965     int cpus;
1966 
1967     cpumask_clear(&all_worker_cpus);
1968     /* Scrub block size. */
1969     chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
1970     if ( chunk_size == 0 )
1971         chunk_size = MB(128) >> PAGE_SHIFT;
1972 
1973     /* Round #0 - figure out amounts and which CPUs to use. */
1974     for_each_online_node ( i )
1975     {
1976         if ( !node_spanned_pages(i) )
1977             continue;
1978         /* Calculate Node memory start and end address. */
1979         start = max(node_start_pfn(i), mfn_x(first_valid_mfn));
1980         end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
1981         /* Just in case NODE has 1 page and starts below first_valid_mfn. */
1982         end = max(end, start);
1983         /* CPUs that are online and on this node (if there are none, that is OK). */
1984         cpus = find_non_smt(i, &node_cpus);
1985         cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
1986         if ( cpus <= 0 )
1987         {
1988             /* No CPUs on this node. Round #2 will take care of it. */
1989             rem = 0;
1990             region[i].per_cpu_sz = (end - start);
1991         }
1992         else
1993         {
1994             rem = (end - start) % cpus;
1995             region[i].per_cpu_sz = (end - start) / cpus;
1996             if ( region[i].per_cpu_sz > max_per_cpu_sz )
1997                 max_per_cpu_sz = region[i].per_cpu_sz;
1998         }
1999         region[i].start = start;
2000         region[i].rem = rem;
2001         cpumask_copy(&region[i].cpus, &node_cpus);
2002     }
2003 
2004     printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(),
2005            cpumask_weight(&all_worker_cpus));
2006 
2007     /* Round: #1 - do NUMA nodes with CPUs. */
2008     for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
2009     {
2010         for_each_online_node ( i )
2011             region[i].offset = offset;
2012 
2013         process_pending_softirqs();
2014 
2015         spin_lock(&heap_lock);
2016         on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
2017         spin_unlock(&heap_lock);
2018 
2019         printk(".");
2020     }
2021 
2022     /*
2023      * Round #2: NUMA nodes with no CPUs get scrubbed by CPUs on the
2024      * closest node that does have CPUs.
2025      */
2026     for_each_online_node ( i )
2027     {
2028         node_cpus = node_to_cpumask(i);
2029 
2030         if ( !cpumask_empty(&node_cpus) )
2031             continue;
2032 
2033         last_distance = INT_MAX;
2034         best_node = first_node(node_online_map);
2035         /* Figure out which NODE CPUs are close. */
2036         for_each_online_node ( j )
2037         {
2038             u8 distance;
2039 
2040             if ( cpumask_empty(&node_to_cpumask(j)) )
2041                 continue;
2042 
2043             distance = __node_distance(i, j);
2044             if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
2045             {
2046                 last_distance = distance;
2047                 best_node = j;
2048             }
2049         }
2050         /*
2051          * Use CPUs from the best node; if there are no CPUs on the
2052          * first node (the default), use the BSP.
2053          */
2054         cpus = find_non_smt(best_node, &node_cpus);
2055         if ( cpus == 0 )
2056         {
2057             __cpumask_set_cpu(smp_processor_id(), &node_cpus);
2058             cpus = 1;
2059         }
2060         /* We already have the node information from round #0. */
2061         region[i].rem = region[i].per_cpu_sz % cpus;
2062         region[i].per_cpu_sz /= cpus;
2063         max_per_cpu_sz = region[i].per_cpu_sz;
2064         cpumask_copy(&region[i].cpus, &node_cpus);
2065 
2066         for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
2067         {
2068             region[i].offset = offset;
2069 
2070             process_pending_softirqs();
2071 
2072             spin_lock(&heap_lock);
2073             on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 1);
2074             spin_unlock(&heap_lock);
2075 
2076             printk(".");
2077         }
2078     }
2079 
2080     printk("done.\n");
2081 
2082 #ifdef CONFIG_SCRUB_DEBUG
2083     scrub_debug = true;
2084 #endif
2085 }
2086 
2087 void __init heap_init_late(void)
2088 {
2089     /*
2090      * Now that the heap is initialised, set the bounds
2091      * for the low-memory virq algorithm.
2092      */
2093     setup_low_mem_virq();
2094 
2095     switch ( opt_bootscrub )
2096     {
2097     default:
2098         ASSERT_UNREACHABLE();
2099         /* Fall through */
2100 
2101     case BOOTSCRUB_IDLE:
2102         printk("Scrubbing Free RAM in background\n");
2103         break;
2104 
2105     case BOOTSCRUB_ON:
2106         scrub_heap_pages();
2107         break;
2108 
2109     case BOOTSCRUB_OFF:
2110         break;
2111     }
2112 }
2113 
2114 
2115 /*************************
2116  * XEN-HEAP SUB-ALLOCATOR
2117  */
2118 
2119 #if defined(CONFIG_SEPARATE_XENHEAP)
2120 
2121 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2122 {
2123     ps = round_pgup(ps);
2124     pe = round_pgdown(pe);
2125     if ( pe <= ps )
2126         return;
2127 
2128     /*
2129      * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
2130      * prevent merging of power-of-two blocks across the zone boundary.
2131      */
2132     if ( ps && !is_xen_heap_mfn(mfn_add(maddr_to_mfn(ps), -1)) )
2133         ps += PAGE_SIZE;
2134     if ( !is_xen_heap_mfn(maddr_to_mfn(pe)) )
2135         pe -= PAGE_SIZE;
2136 
2137     memguard_guard_range(maddr_to_virt(ps), pe - ps);
2138 
2139     init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
2140 }
2141 
2142 
2143 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2144 {
2145     struct page_info *pg;
2146 
2147     ASSERT(!in_irq());
2148 
2149     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
2150                           order, memflags | MEMF_no_scrub, NULL);
2151     if ( unlikely(pg == NULL) )
2152         return NULL;
2153 
2154     memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
2155 
2156     return page_to_virt(pg);
2157 }
2158 
2159 
2160 void free_xenheap_pages(void *v, unsigned int order)
2161 {
2162     ASSERT(!in_irq());
2163 
2164     if ( v == NULL )
2165         return;
2166 
2167     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
2168 
2169     free_heap_pages(virt_to_page(v), order, false);
2170 }
2171 
2172 #else  /* !CONFIG_SEPARATE_XENHEAP */
2173 
2174 void __init xenheap_max_mfn(unsigned long mfn)
2175 {
2176     ASSERT(!first_node_initialised);
2177     ASSERT(!xenheap_bits);
2178     BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
2179     xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
2180     printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
2181 }
2182 
2183 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2184 {
2185     init_domheap_pages(ps, pe);
2186 }
2187 
2188 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2189 {
2190     struct page_info *pg;
2191     unsigned int i;
2192 
2193     ASSERT(!in_irq());
2194 
2195     if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
2196         memflags &= ~MEMF_bits(~0U);
2197     if ( !(memflags >> _MEMF_bits) )
2198         memflags |= MEMF_bits(xenheap_bits);
2199 
2200     pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub);
2201     if ( unlikely(pg == NULL) )
2202         return NULL;
2203 
2204     for ( i = 0; i < (1u << order); i++ )
2205         pg[i].count_info |= PGC_xen_heap;
2206 
2207     return page_to_virt(pg);
2208 }
2209 
2210 void free_xenheap_pages(void *v, unsigned int order)
2211 {
2212     struct page_info *pg;
2213     unsigned int i;
2214 
2215     ASSERT(!in_irq());
2216 
2217     if ( v == NULL )
2218         return;
2219 
2220     pg = virt_to_page(v);
2221 
2222     for ( i = 0; i < (1u << order); i++ )
2223         pg[i].count_info &= ~PGC_xen_heap;
2224 
2225     free_heap_pages(pg, order, true);
2226 }
2227 
2228 #endif  /* CONFIG_SEPARATE_XENHEAP */
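
/*
 * Typical xenheap usage, common to both variants above (illustrative only):
 *
 *     void *buf = alloc_xenheap_pages(2, 0);   // 4 contiguous pages
 *
 *     if ( buf )
 *     {
 *         // buf is permanently mapped, so it can be used directly
 *         free_xenheap_pages(buf, 2);
 *     }
 *
 * Note that both variants force MEMF_no_scrub on allocation, so callers
 * must not assume the returned memory is zeroed.
 */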
2229 
2230 
2231 
2232 /*************************
2233  * DOMAIN-HEAP SUB-ALLOCATOR
2234  */
2235 
2236 void init_domheap_pages(paddr_t ps, paddr_t pe)
2237 {
2238     mfn_t smfn, emfn;
2239 
2240     ASSERT(!in_irq());
2241 
2242     smfn = maddr_to_mfn(round_pgup(ps));
2243     emfn = maddr_to_mfn(round_pgdown(pe));
2244 
2245     if ( mfn_x(emfn) <= mfn_x(smfn) )
2246         return;
2247 
2248     init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn));
2249 }
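
/*
 * Arch code typically hands RAM to the allocator by calling this for each
 * usable region once the boot allocator is done with it, e.g. (addresses
 * are illustrative):
 *
 *     init_domheap_pages(0x100000000UL, 0x180000000UL);   // 2GiB of RAM
 *
 * Partial pages at either end are dropped by round_pgup()/round_pgdown()
 * above, and init_heap_pages() then releases the range, page by page, into
 * the buddy allocator.
 */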
2250 
2251 
2252 int assign_pages(
2253     struct domain *d,
2254     struct page_info *pg,
2255     unsigned int order,
2256     unsigned int memflags)
2257 {
2258     int rc = 0;
2259     unsigned long i;
2260 
2261     spin_lock(&d->page_alloc_lock);
2262 
2263     if ( unlikely(d->is_dying) )
2264     {
2265         gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
2266                 d->domain_id);
2267         rc = -EINVAL;
2268         goto out;
2269     }
2270 
2271 #ifndef NDEBUG
2272     {
2273         unsigned int extra_pages = 0;
2274 
2275         for ( i = 0; i < (1ul << order); i++ )
2276         {
2277             ASSERT(!(pg[i].count_info & ~PGC_extra));
2278             if ( pg[i].count_info & PGC_extra )
2279                 extra_pages++;
2280         }
2281 
2282         ASSERT(!extra_pages ||
2283                ((memflags & MEMF_no_refcount) &&
2284                 extra_pages == 1u << order));
2285     }
2286 #endif
2287 
2288     if ( pg[0].count_info & PGC_extra )
2289     {
2290         d->extra_pages += 1u << order;
2291         memflags &= ~MEMF_no_refcount;
2292     }
2293     else if ( !(memflags & MEMF_no_refcount) )
2294     {
2295         unsigned int tot_pages = domain_tot_pages(d) + (1 << order);
2296 
2297         if ( unlikely(tot_pages > d->max_pages) )
2298         {
2299             gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
2300                     "%u > %u\n", d->domain_id, tot_pages, d->max_pages);
2301             rc = -E2BIG;
2302             goto out;
2303         }
2304     }
2305 
2306     if ( !(memflags & MEMF_no_refcount) &&
2307          unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) )
2308         get_knownalive_domain(d);
2309 
2310     for ( i = 0; i < (1 << order); i++ )
2311     {
2312         ASSERT(page_get_owner(&pg[i]) == NULL);
2313         page_set_owner(&pg[i], d);
2314         smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
2315         pg[i].count_info =
2316             (pg[i].count_info & PGC_extra) | PGC_allocated | 1;
2317         page_list_add_tail(&pg[i], page_to_list(d, &pg[i]));
2318     }
2319 
2320  out:
2321     spin_unlock(&d->page_alloc_lock);
2322     return rc;
2323 }
2324 
2325 
2326 struct page_info *alloc_domheap_pages(
2327     struct domain *d, unsigned int order, unsigned int memflags)
2328 {
2329     struct page_info *pg = NULL;
2330     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
2331     unsigned int dma_zone;
2332 
2333     ASSERT(!in_irq());
2334 
2335     bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
2336                                       bits ? : (BITS_PER_LONG+PAGE_SHIFT));
2337     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
2338         return NULL;
2339 
2340     if ( memflags & MEMF_no_owner )
2341         memflags |= MEMF_no_refcount;
2342 
2343     if ( !dma_bitsize )
2344         memflags &= ~MEMF_no_dma;
2345     else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi )
2346         pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
2347 
2348     if ( (pg == NULL) &&
2349          ((memflags & MEMF_no_dma) ||
2350           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
2351                                   memflags, d)) == NULL)) )
2352          return NULL;
2353 
2354     if ( d && !(memflags & MEMF_no_owner) )
2355     {
2356         if ( memflags & MEMF_no_refcount )
2357         {
2358             unsigned long i;
2359 
2360             for ( i = 0; i < (1ul << order); i++ )
2361             {
2362                 ASSERT(!pg[i].count_info);
2363                 pg[i].count_info = PGC_extra;
2364             }
2365         }
2366         if ( assign_pages(d, pg, order, memflags) )
2367         {
2368             free_heap_pages(pg, order, memflags & MEMF_no_scrub);
2369             return NULL;
2370         }
2371     }
2372 
2373     return pg;
2374 }
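
/*
 * Illustrative allocation patterns (not code from this file):
 *
 *     // Anonymous allocation, not accounted to any domain:
 *     struct page_info *pg = alloc_domheap_pages(NULL, 0, 0);
 *
 *     // Allocation owned by and accounted to domain d:
 *     struct page_info *pg2 = alloc_domheap_pages(d, 0, 0);
 *
 * In both cases the pages are released with free_domheap_pages(pg, order);
 * the owned case additionally drops the domain's page accounting there.
 */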
2375 
2376 void free_domheap_pages(struct page_info *pg, unsigned int order)
2377 {
2378     struct domain *d = page_get_owner(pg);
2379     unsigned int i;
2380     bool drop_dom_ref;
2381 
2382     ASSERT(!in_irq());
2383 
2384     if ( unlikely(is_xen_heap_page(pg)) )
2385     {
2386         /* NB. May recursively lock from relinquish_memory(). */
2387         spin_lock_recursive(&d->page_alloc_lock);
2388 
2389         for ( i = 0; i < (1 << order); i++ )
2390             arch_free_heap_page(d, &pg[i]);
2391 
2392         d->xenheap_pages -= 1 << order;
2393         drop_dom_ref = (d->xenheap_pages == 0);
2394 
2395         spin_unlock_recursive(&d->page_alloc_lock);
2396     }
2397     else
2398     {
2399         bool scrub;
2400 
2401         if ( likely(d) && likely(d != dom_cow) )
2402         {
2403             /* NB. May recursively lock from relinquish_memory(). */
2404             spin_lock_recursive(&d->page_alloc_lock);
2405 
2406             for ( i = 0; i < (1 << order); i++ )
2407             {
2408                 if ( pg[i].u.inuse.type_info & PGT_count_mask )
2409                 {
2410                     printk(XENLOG_ERR
2411                            "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
2412                            i, mfn_x(page_to_mfn(pg + i)),
2413                            pg[i].count_info, pg[i].v.free.order,
2414                            pg[i].u.free.val, pg[i].tlbflush_timestamp);
2415                     BUG();
2416                 }
2417                 arch_free_heap_page(d, &pg[i]);
2418                 if ( pg[i].count_info & PGC_extra )
2419                 {
2420                     ASSERT(d->extra_pages);
2421                     d->extra_pages--;
2422                 }
2423             }
2424 
2425             drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order));
2426 
2427             spin_unlock_recursive(&d->page_alloc_lock);
2428 
2429             /*
2430              * Normally we expect a domain to clear pages before freeing them,
2431              * if it cares about the secrecy of their contents. However, after
2432              * a domain has died we assume responsibility for erasure. We also
2433              * scrub unconditionally if the scrub_domheap option is set.
2434              */
2435             scrub = d->is_dying || scrub_debug || opt_scrub_domheap;
2436         }
2437         else
2438         {
2439             /*
2440              * All we need to check is that on dom_cow only order-0 chunks
2441              * make it here. Due to the if() above, the only two possible
2442              * cases right now are d == NULL and d == dom_cow. To protect
2443              * against relaxation of that if() condition without updating the
2444              * check here, don't check d != dom_cow for now.
2445              */
2446             ASSERT(!d || !order);
2447             drop_dom_ref = false;
2448             scrub = 1;
2449         }
2450 
2451         free_heap_pages(pg, order, scrub);
2452     }
2453 
2454     if ( drop_dom_ref )
2455         put_domain(d);
2456 }
2457 
2458 unsigned long avail_domheap_pages_region(
2459     unsigned int node, unsigned int min_width, unsigned int max_width)
2460 {
2461     int zone_lo, zone_hi;
2462 
2463     zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
2464     zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
2465 
2466     zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
2467     zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
2468 
2469     return avail_heap_pages(zone_lo, zone_hi, node);
2470 }
2471 
2472 unsigned long avail_domheap_pages(void)
2473 {
2474     return avail_heap_pages(MEMZONE_XEN + 1,
2475                             NR_ZONES - 1,
2476                             -1);
2477 }
2478 
2479 unsigned long avail_node_heap_pages(unsigned int nodeid)
2480 {
2481     return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
2482 }
2483 
2484 
2485 static void pagealloc_info(unsigned char key)
2486 {
2487     unsigned int zone = MEMZONE_XEN;
2488     unsigned long n, total = 0;
2489 
2490     printk("Physical memory information:\n");
2491     printk("    Xen heap: %lukB free\n",
2492            avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
2493 
2494     while ( ++zone < NR_ZONES )
2495     {
2496         if ( (zone + PAGE_SHIFT) == dma_bitsize )
2497         {
2498             printk("    DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
2499             total = 0;
2500         }
2501 
2502         if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
2503         {
2504             total += n;
2505             printk("    heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
2506         }
2507     }
2508 
2509     printk("    Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
2510 }
2511 
2512 static __init int pagealloc_keyhandler_init(void)
2513 {
2514     register_keyhandler('m', pagealloc_info, "memory info", 1);
2515     return 0;
2516 }
2517 __initcall(pagealloc_keyhandler_init);
2518 
2519 
2520 void scrub_one_page(struct page_info *pg)
2521 {
2522     if ( unlikely(pg->count_info & PGC_broken) )
2523         return;
2524 
2525 #ifndef NDEBUG
2526     /* Avoid callers relying on allocations returning zeroed pages. */
2527     unmap_domain_page(memset(__map_domain_page(pg),
2528                              SCRUB_BYTE_PATTERN, PAGE_SIZE));
2529 #else
2530     /* For a production build, clear_page() is the fastest way to scrub. */
2531     clear_domain_page(page_to_mfn(pg));
2532 #endif
2533 }
2534 
2535 static void dump_heap(unsigned char key)
2536 {
2537     s_time_t      now = NOW();
2538     int           i, j;
2539 
2540     printk("'%c' pressed -> dumping heap info (now = %"PRI_stime")\n", key,
2541            now);
2542 
2543     for ( i = 0; i < MAX_NUMNODES; i++ )
2544     {
2545         if ( !avail[i] )
2546             continue;
2547         for ( j = 0; j < NR_ZONES; j++ )
2548             printk("heap[node=%d][zone=%d] -> %lu pages\n",
2549                    i, j, avail[i][j]);
2550     }
2551 
2552     for ( i = 0; i < MAX_NUMNODES; i++ )
2553     {
2554         if ( !node_need_scrub[i] )
2555             continue;
2556         printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]);
2557     }
2558 }
2559 
2560 static __init int register_heap_trigger(void)
2561 {
2562     register_keyhandler('H', dump_heap, "dump heap info", 1);
2563     return 0;
2564 }
2565 __initcall(register_heap_trigger);
2566 
2567 struct domain *get_pg_owner(domid_t domid)
2568 {
2569     struct domain *pg_owner = NULL, *curr = current->domain;
2570 
2571     if ( likely(domid == DOMID_SELF) )
2572     {
2573         pg_owner = rcu_lock_current_domain();
2574         goto out;
2575     }
2576 
2577     if ( unlikely(domid == curr->domain_id) )
2578     {
2579         gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
2580         goto out;
2581     }
2582 
2583     switch ( domid )
2584     {
2585     case DOMID_IO:
2586         pg_owner = rcu_lock_domain(dom_io);
2587         break;
2588 
2589     case DOMID_XEN:
2590         pg_owner = rcu_lock_domain(dom_xen);
2591         break;
2592 
2593     default:
2594         if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2595             gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
2596         break;
2597     }
2598 
2599  out:
2600     return pg_owner;
2601 }
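
/*
 * Callers are expected to pair this with an RCU unlock, e.g. (sketch; the
 * error code chosen here is an assumption):
 *
 *     struct domain *pg_owner = get_pg_owner(domid);
 *
 *     if ( pg_owner == NULL )
 *         return -ESRCH;
 *     // ... operate on pg_owner's pages ...
 *     rcu_unlock_domain(pg_owner);
 */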
2602 
2603 /*
2604  * Local variables:
2605  * mode: C
2606  * c-file-style: "BSD"
2607  * c-basic-offset: 4
2608  * tab-width: 4
2609  * indent-tabs-mode: nil
2610  * End:
2611  */
2612