1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 /*
24 * In general Xen maintains two pools of memory:
25 *
 * - Xen heap: Memory which is always mapped (i.e. accessible by
27 * virtual address), via a permanent and contiguous
28 * "direct mapping". Macros like va() and pa() are valid
29 * for such memory and it is always permissible to stash
30 * pointers to Xen heap memory in data structures etc.
31 *
32 * Xen heap pages are always anonymous (that is, not tied
33 * or accounted to any particular domain).
34 *
35 * - Dom heap: Memory which must be explicitly mapped, usually
36 * transiently with map_domain_page(), in order to be
 * used. va() and pa() are not valid for such memory. When
 * stashing pointers to dom heap pages, take care that the
 * mapping used is a permanent one (e.g. vmap() or
 * map_domain_page_global()); it is not safe to stash
 * transient mappings such as those from map_domain_page().
42 *
43 * Dom heap pages are often tied to a particular domain,
44 * but need not be (passing domain==NULL results in an
45 * anonymous dom heap allocation).
46 *
47 * The exact nature of this split is a (sub)arch decision which can
48 * select one of three main variants:
49 *
50 * CONFIG_SEPARATE_XENHEAP=y
51 *
52 * The xen heap is maintained as an entirely separate heap.
53 *
54 * Arch code arranges for some (perhaps small) amount of physical
55 * memory to be covered by a direct mapping and registers that
56 * memory as the Xen heap (via init_xenheap_pages()) and the
57 * remainder as the dom heap.
58 *
59 * This mode of operation is most commonly used by 32-bit arches
60 * where the virtual address space is insufficient to map all RAM.
61 *
62 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
63 *
64 * All of RAM is covered by a permanent contiguous mapping and there
65 * is only a single heap.
66 *
67 * Memory allocated from the Xen heap is flagged (in
68 * page_info.count_info) with PGC_xen_heap. Memory allocated from
69 * the Dom heap must still be explicitly mapped before use
70 * (e.g. with map_domain_page) in particular in common code.
71 *
72 * xenheap_max_mfn() should not be called by arch code.
73 *
74 * This mode of operation is most commonly used by 64-bit arches
75 * which have sufficient free virtual address space to permanently
 * map the largest practical amount of RAM currently expected on that
77 * arch.
78 *
79 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
80 *
81 * There is a single heap, but only the beginning (up to some
82 * threshold) is covered by a permanent contiguous mapping.
83 *
84 * Memory allocated from the Xen heap is allocated from below the
85 * threshold and flagged with PGC_xen_heap. Memory allocated from
86 * the dom heap is allocated from anywhere in the heap (although it
87 * will prefer to allocate from as high as possible to try and keep
88 * Xen heap suitable memory available).
89 *
90 * Arch code must call xenheap_max_mfn() to signal the limit of the
91 * direct mapping.
92 *
93 * This mode of operation is most commonly used by 64-bit arches
94 * which have a restricted amount of virtual address space available
95 * for a direct map (due to e.g. reservations for other purposes)
96 * such that it is not possible to map all of RAM on systems with
97 * the largest practical amount of RAM currently expected on that
98 * arch.
99 *
100 * Boot Allocator
101 *
102 * In addition to the two primary pools (xen heap and dom heap) a
103 * third "boot allocator" is used at start of day. This is a
 * simplified allocator which can be used before the main heap
 * allocators have been initialised.
105 *
106 * Typically all memory which is destined to be dom heap memory
107 * (which is everything in the CONFIG_SEPARATE_XENHEAP=n
108 * configurations) is first allocated to the boot allocator (with
109 * init_boot_pages()) and is then handed over to the main dom heap in
110 * end_boot_allocator().
111 *
112 * "Contiguous" mappings
113 *
114 * Note that although the above talks about "contiguous" mappings
115 * some architectures implement a scheme ("PDX compression") to
116 * compress unused portions of the machine address space (i.e. large
117 * gaps between distinct banks of memory) in order to avoid creating
118 * enormous frame tables and direct maps which mostly map
119 * nothing. Thus a contiguous mapping may still have distinct
120 * regions within it.
121 */
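/*
 * As a rough illustration of the xen heap / dom heap split described above
 * (a sketch only, using the allocator interfaces declared in xen/mm.h and
 * xen/domain_page.h):
 *
 *     // Xen heap: directly mapped; the pointer may be stashed and used
 *     // at any time.
 *     void *p = alloc_xenheap_pages(0, 0);
 *     if ( p )
 *     {
 *         clear_page(p);
 *         free_xenheap_pages(p, 0);
 *     }
 *
 *     // Dom heap: only a struct page_info is returned; the page must be
 *     // mapped (here transiently) before it can be touched.
 *     struct page_info *pg = alloc_domheap_pages(NULL, 0, 0);
 *     if ( pg )
 *     {
 *         void *v = map_domain_page(page_to_mfn(pg));
 *         clear_page(v);
 *         unmap_domain_page(v);
 *         free_domheap_pages(pg, 0);
 *     }
 */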
122
123 #include <xen/init.h>
124 #include <xen/types.h>
125 #include <xen/lib.h>
126 #include <xen/sched.h>
127 #include <xen/spinlock.h>
128 #include <xen/mm.h>
129 #include <xen/param.h>
130 #include <xen/irq.h>
131 #include <xen/softirq.h>
132 #include <xen/domain_page.h>
133 #include <xen/keyhandler.h>
134 #include <xen/perfc.h>
135 #include <xen/pfn.h>
136 #include <xen/numa.h>
137 #include <xen/nodemask.h>
138 #include <xen/event.h>
139 #include <public/sysctl.h>
140 #include <public/sched.h>
141 #include <asm/page.h>
142 #include <asm/numa.h>
143 #include <asm/flushtlb.h>
144 #ifdef CONFIG_X86
145 #include <asm/guest.h>
146 #include <asm/p2m.h>
147 #include <asm/setup.h> /* for highmem_start only */
148 #include <asm/paging.h>
149 #else
150 #define p2m_pod_offline_or_broken_hit(pg) 0
151 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
152 #endif
153
154 /*
155 * Comma-separated list of hexadecimal page numbers containing bad bytes.
156 * e.g. 'badpage=0x3f45,0x8a321'.
157 */
158 static char __initdata opt_badpage[100] = "";
159 string_param("badpage", opt_badpage);
160
161 /*
162 * no-bootscrub -> Free pages are not zeroed during boot.
163 */
164 enum bootscrub_mode {
165 BOOTSCRUB_OFF,
166 BOOTSCRUB_ON,
167 BOOTSCRUB_IDLE,
168 };
169
170 /*
171 * opt_bootscrub should live in the init section, since it's not accessed
172 * afterwards. However at least LLVM assumes there are no side effects of
173 * accessing the variable, and optimizes the condition in init_heap_pages() so
174 * opt_bootscrub is read regardless of the value of system_state:
175 * https://bugs.llvm.org/show_bug.cgi?id=39707
176 */
177 static enum bootscrub_mode __read_mostly opt_bootscrub = BOOTSCRUB_IDLE;
static int __init parse_bootscrub_param(const char *s)
179 {
180 /* Interpret 'bootscrub' alone in its positive boolean form */
181 if ( *s == '\0' )
182 {
183 opt_bootscrub = BOOTSCRUB_ON;
184 return 0;
185 }
186
187 switch ( parse_bool(s, NULL) )
188 {
189 case 0:
190 opt_bootscrub = BOOTSCRUB_OFF;
191 break;
192
193 case 1:
194 opt_bootscrub = BOOTSCRUB_ON;
195 break;
196
197 default:
198 if ( !strcmp(s, "idle") )
199 opt_bootscrub = BOOTSCRUB_IDLE;
200 else
201 return -EINVAL;
202 break;
203 }
204
205 return 0;
206 }
207 custom_param("bootscrub", parse_bootscrub_param);
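/*
 * For reference, the forms accepted by the parser above are:
 *
 *     bootscrub            -> BOOTSCRUB_ON
 *     bootscrub=on|off     -> BOOTSCRUB_ON / BOOTSCRUB_OFF
 *     bootscrub=idle       -> BOOTSCRUB_IDLE (the default: scrub free
 *                             pages from the idle loop)
 */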
208
209 /*
 * bootscrub_chunk -> Number of bytes to scrub in lockstep on non-SMT CPUs
211 * on all NUMA nodes.
212 */
213 static unsigned long __initdata opt_bootscrub_chunk = MB(128);
214 size_param("bootscrub_chunk", opt_bootscrub_chunk);
215
216 /* scrub-domheap -> Domheap pages are scrubbed when freed */
217 static bool __read_mostly opt_scrub_domheap;
218 boolean_param("scrub-domheap", opt_scrub_domheap);
219
220 #ifdef CONFIG_SCRUB_DEBUG
221 static bool __read_mostly scrub_debug;
222 #else
223 #define scrub_debug false
224 #endif
225
226 /*
 * Bit width of the DMA heap -- used to override the NUMA-node-first
 * allocation strategy, which can otherwise exhaust low memory.
229 */
230 static unsigned int dma_bitsize;
231 integer_param("dma_bits", dma_bitsize);
232
233 /* Offlined page list, protected by heap_lock. */
234 PAGE_LIST_HEAD(page_offlined_list);
235 /* Broken page list, protected by heap_lock. */
236 PAGE_LIST_HEAD(page_broken_list);
237
238 /*************************
239 * BOOT-TIME ALLOCATOR
240 */
241
242 /*
 * first_valid_mfn is exported because it is used in ARM-specific NUMA
244 * helpers. See comment in asm-arm/numa.h.
245 */
246 mfn_t first_valid_mfn = INVALID_MFN_INITIALIZER;
247
248 struct bootmem_region {
249 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
250 };
251 /* Statically allocate a page for bootmem_region_list. */
252 static struct bootmem_region __initdata
253 bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)];
254 static unsigned int __initdata nr_bootmem_regions;
255
256 struct scrub_region {
257 unsigned long offset;
258 unsigned long start;
259 unsigned long per_cpu_sz;
260 unsigned long rem;
261 cpumask_t cpus;
262 };
263 static struct scrub_region __initdata region[MAX_NUMNODES];
264 static unsigned long __initdata chunk_size;
265
static void __init bootmem_region_add(unsigned long s, unsigned long e)
267 {
268 unsigned int i;
269
270 if ( s >= e )
271 return;
272
273 for ( i = 0; i < nr_bootmem_regions; i++ )
274 if ( s < bootmem_region_list[i].e )
275 break;
276
277 BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
278 BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region)));
279
280 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
281 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
282 bootmem_region_list[i] = (struct bootmem_region) { s, e };
283 nr_bootmem_regions++;
284 }
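/*
 * To illustrate the invariant maintained above: the list stays sorted by
 * start MFN with no overlaps, so adding [0x500, 0x600) to a list holding
 * [0x100, 0x200) and [0x800, 0x900) slots it in between the two existing
 * entries, while an overlapping addition trips the first BUG_ON().
 */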
285
static void __init bootmem_region_zap(unsigned long s, unsigned long e)
287 {
288 unsigned int i;
289
290 for ( i = 0; i < nr_bootmem_regions; i++ )
291 {
292 struct bootmem_region *r = &bootmem_region_list[i];
293 if ( e <= r->s )
294 break;
295 if ( s >= r->e )
296 continue;
297 if ( s <= r->s )
298 {
299 r->s = min(e, r->e);
300 }
301 else if ( e >= r->e )
302 {
303 r->e = s;
304 }
305 else
306 {
307 unsigned long _e = r->e;
308 r->e = s;
309 bootmem_region_add(e, _e);
310 }
311 }
312 }
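/*
 * For example: with a single region covering MFNs [0x100, 0x200), a call to
 * bootmem_region_zap(0x140, 0x150) trims that entry to [0x100, 0x140) and
 * re-adds [0x150, 0x200) via bootmem_region_add(), leaving two disjoint
 * free ranges around the zapped hole.
 */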
313
void __init init_boot_pages(paddr_t ps, paddr_t pe)
315 {
316 unsigned long bad_spfn, bad_epfn;
317 const char *p;
318 #ifdef CONFIG_X86
319 const struct platform_bad_page *badpage;
320 unsigned int i, array_size;
321
322 BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
323 MAX_ORDER + 1);
324 #endif
325 BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));
326
327 ps = round_pgup(ps);
328 pe = round_pgdown(pe);
329 if ( pe <= ps )
330 return;
331
332 first_valid_mfn = mfn_min(maddr_to_mfn(ps), first_valid_mfn);
333
334 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
335
336 #ifdef CONFIG_X86
337 /*
     * Here we put platform-specific memory range workarounds, i.e.
     * memory known to be corrupt or otherwise needing to be reserved on
     * specific platforms.
     * Such pages are removed from the memory region list below.
342 */
343 badpage = get_platform_badpages(&array_size);
344 if ( badpage )
345 {
346 for ( i = 0; i < array_size; i++ )
347 {
348 bootmem_region_zap(badpage->mfn,
349 badpage->mfn + (1UL << badpage->order));
350 badpage++;
351 }
352 }
353
354 if ( pv_shim )
355 {
356 badpage = pv_shim_reserved_pages(&array_size);
357 if ( badpage )
358 {
359 for ( i = 0; i < array_size; i++ )
360 {
361 bootmem_region_zap(badpage->mfn,
362 badpage->mfn + (1UL << badpage->order));
363 badpage++;
364 }
365 }
366 }
367 #endif
368
369 /* Check new pages against the bad-page list. */
370 p = opt_badpage;
371 while ( *p != '\0' )
372 {
373 bad_spfn = simple_strtoul(p, &p, 0);
374 bad_epfn = bad_spfn;
375
376 if ( *p == '-' )
377 {
378 p++;
379 bad_epfn = simple_strtoul(p, &p, 0);
380 if ( bad_epfn < bad_spfn )
381 bad_epfn = bad_spfn;
382 }
383
384 if ( *p == ',' )
385 p++;
386 else if ( *p != '\0' )
387 break;
388
389 bootmem_region_zap(bad_spfn, bad_epfn+1);
390 }
391 }
392
mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
394 {
395 unsigned long pg, _e;
396 unsigned int i = nr_bootmem_regions;
397
398 BUG_ON(!nr_bootmem_regions);
399
400 while ( i-- )
401 {
402 struct bootmem_region *r = &bootmem_region_list[i];
403
404 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
405 if ( pg >= r->e || pg < r->s )
406 continue;
407
408 #if defined(CONFIG_X86) && !defined(NDEBUG)
409 /*
         * Filter out pfn_align == 1, since the only allocations using a
         * bigger alignment are the ones used for setting up the frame table
         * chunks. Those allocations get remapped anyway, i.e. their not
         * having 1:1 mappings always accessible is not a problem.
414 */
415 if ( highmem_start && pfn_align == 1 &&
416 r->e > PFN_DOWN(highmem_start) )
417 {
418 pg = r->s;
419 if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
420 continue;
421 r->s = pg + nr_pfns;
422 return _mfn(pg);
423 }
424 #endif
425
426 _e = r->e;
427 r->e = pg;
428 bootmem_region_add(pg + nr_pfns, _e);
429 return _mfn(pg);
430 }
431
432 BUG();
433 }
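/*
 * A minimal usage sketch (assuming init_boot_pages() has already seeded the
 * boot allocator and the range is covered by the direct map):
 *
 *     mfn_t mfn = alloc_boot_pages(1, 1);     // one page, no alignment
 *     void *p = mfn_to_virt(mfn_x(mfn));
 *
 * There is no corresponding free: anything still held by the boot allocator
 * is handed to the heap in end_boot_allocator(), while pages allocated here
 * stay allocated.
 */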
434
435
436
437 /*************************
438 * BINARY BUDDY ALLOCATOR
439 */
440
441 #define MEMZONE_XEN 0
442 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT + 1)
443
444 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
445 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
446 (flsl(mfn_x(page_to_mfn(pg))) ? : 1))
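/*
 * Worked example for the macros above: a dom heap page at MFN 0x12345 has
 * flsl(0x12345) == 17 and therefore lives in zone 17, the zone holding
 * frame numbers in [2^16, 2^17). MFN 0 would give flsl() == 0, hence the
 * "?: 1" fallback to zone 1, while Xen heap pages always map to
 * MEMZONE_XEN (zone 0).
 */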
447
448 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
449 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
450 #define heap(node, zone, order) ((*_heap[node])[zone][order])
451
452 static unsigned long node_need_scrub[MAX_NUMNODES];
453
454 static unsigned long *avail[MAX_NUMNODES];
455 static long total_avail_pages;
456
457 static DEFINE_SPINLOCK(heap_lock);
458 static long outstanding_claims; /* total outstanding claims by all domains */
459
unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
461 {
462 long dom_before, dom_after, dom_claimed, sys_before, sys_after;
463
464 ASSERT(spin_is_locked(&d->page_alloc_lock));
465 d->tot_pages += pages;
466
467 /*
     * can test d->outstanding_pages race-free because it can only change
     * if d->page_alloc_lock and heap_lock are both held, see also
     * domain_set_outstanding_pages below
471 */
472 if ( !d->outstanding_pages )
473 goto out;
474
475 spin_lock(&heap_lock);
476 /* adjust domain outstanding pages; may not go negative */
477 dom_before = d->outstanding_pages;
478 dom_after = dom_before - pages;
479 BUG_ON(dom_before < 0);
480 dom_claimed = dom_after < 0 ? 0 : dom_after;
481 d->outstanding_pages = dom_claimed;
482 /* flag accounting bug if system outstanding_claims would go negative */
483 sys_before = outstanding_claims;
484 sys_after = sys_before - (dom_before - dom_claimed);
485 BUG_ON(sys_after < 0);
486 outstanding_claims = sys_after;
487 spin_unlock(&heap_lock);
488
489 out:
490 return d->tot_pages;
491 }
492
int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
494 {
495 int ret = -ENOMEM;
496 unsigned long claim, avail_pages;
497
498 /*
     * Take the domain's page_alloc_lock; otherwise every d->tot_pages
     * adjustment would always have to take the global heap_lock rather than
     * only in the much rarer case that d->outstanding_pages is non-zero.
502 */
503 spin_lock(&d->page_alloc_lock);
504 spin_lock(&heap_lock);
505
506 /* pages==0 means "unset" the claim. */
507 if ( pages == 0 )
508 {
509 outstanding_claims -= d->outstanding_pages;
510 d->outstanding_pages = 0;
511 ret = 0;
512 goto out;
513 }
514
515 /* only one active claim per domain please */
516 if ( d->outstanding_pages )
517 {
518 ret = -EINVAL;
519 goto out;
520 }
521
522 /* disallow a claim not exceeding domain_tot_pages() or above max_pages */
523 if ( (pages <= domain_tot_pages(d)) || (pages > d->max_pages) )
524 {
525 ret = -EINVAL;
526 goto out;
527 }
528
529 /* how much memory is available? */
530 avail_pages = total_avail_pages;
531
532 avail_pages -= outstanding_claims;
533
534 /*
535 * Note, if domain has already allocated memory before making a claim
536 * then the claim must take domain_tot_pages() into account
537 */
538 claim = pages - domain_tot_pages(d);
539 if ( claim > avail_pages )
540 goto out;
541
542 /* yay, claim fits in available memory, stake the claim, success! */
543 d->outstanding_pages = claim;
544 outstanding_claims += d->outstanding_pages;
545 ret = 0;
546
547 out:
548 spin_unlock(&heap_lock);
549 spin_unlock(&d->page_alloc_lock);
550 return ret;
551 }
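/*
 * A numeric sketch of the claim arithmetic above: a domain with
 * domain_tot_pages() == 1000 staking a claim of pages == 5000 records
 * d->outstanding_pages = 4000, provided those 4000 pages fit in
 * total_avail_pages - outstanding_claims. Subsequent allocations drain the
 * claim via domain_adjust_tot_pages() until it reaches zero.
 */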
552
void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
554 {
555 spin_lock(&heap_lock);
556 *outstanding_pages = outstanding_claims;
557 *free_pages = avail_domheap_pages();
558 spin_unlock(&heap_lock);
559 }
560
561 static bool __read_mostly first_node_initialised;
562 #ifndef CONFIG_SEPARATE_XENHEAP
563 static unsigned int __read_mostly xenheap_bits;
564 #else
565 #define xenheap_bits 0
566 #endif
567
static unsigned long init_node_heap(int node, unsigned long mfn,
569 unsigned long nr, bool *use_tail)
570 {
571 /* First node to be discovered has its heap metadata statically alloced. */
572 static heap_by_zone_and_order_t _heap_static;
573 static unsigned long avail_static[NR_ZONES];
574 unsigned long needed = (sizeof(**_heap) +
575 sizeof(**avail) * NR_ZONES +
576 PAGE_SIZE - 1) >> PAGE_SHIFT;
577 int i, j;
578
579 if ( !first_node_initialised )
580 {
581 _heap[node] = &_heap_static;
582 avail[node] = avail_static;
583 first_node_initialised = true;
584 needed = 0;
585 }
586 else if ( *use_tail && nr >= needed &&
587 arch_mfn_in_directmap(mfn + nr) &&
588 (!xenheap_bits ||
589 !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
590 {
591 _heap[node] = mfn_to_virt(mfn + nr - needed);
592 avail[node] = mfn_to_virt(mfn + nr - 1) +
593 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
594 }
595 else if ( nr >= needed &&
596 arch_mfn_in_directmap(mfn + needed) &&
597 (!xenheap_bits ||
598 !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
599 {
600 _heap[node] = mfn_to_virt(mfn);
601 avail[node] = mfn_to_virt(mfn + needed - 1) +
602 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
603 *use_tail = false;
604 }
605 else if ( get_order_from_bytes(sizeof(**_heap)) ==
606 get_order_from_pages(needed) )
607 {
608 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
609 BUG_ON(!_heap[node]);
610 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
611 sizeof(**avail) * NR_ZONES;
612 needed = 0;
613 }
614 else
615 {
616 _heap[node] = xmalloc(heap_by_zone_and_order_t);
617 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
618 BUG_ON(!_heap[node] || !avail[node]);
619 needed = 0;
620 }
621
622 memset(avail[node], 0, NR_ZONES * sizeof(long));
623
624 for ( i = 0; i < NR_ZONES; i++ )
625 for ( j = 0; j <= MAX_ORDER; j++ )
626 INIT_PAGE_LIST_HEAD(&heap(node, i, j));
627
628 return needed;
629 }
630
631 /* Default to 64 MiB */
632 #define DEFAULT_LOW_MEM_VIRQ (((paddr_t) 64) << 20)
633 #define MAX_LOW_MEM_VIRQ (((paddr_t) 1024) << 20)
634
635 static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
636 size_param("low_mem_virq_limit", opt_low_mem_virq);
637
638 /* Thresholds to control hysteresis. In pages */
639 /* When memory grows above this threshold, reset hysteresis.
640 * -1 initially to not reset until at least one virq issued. */
641 static unsigned long low_mem_virq_high = -1UL;
642 /* Threshold at which we issue virq */
643 static unsigned long low_mem_virq_th = 0;
644 /* Original threshold after all checks completed */
645 static unsigned long low_mem_virq_orig = 0;
646 /* Order for current threshold */
647 static unsigned int low_mem_virq_th_order = 0;
648
649 /* Perform bootstrapping checks and set bounds */
static void __init setup_low_mem_virq(void)
651 {
652 unsigned int order;
653 paddr_t threshold;
654 bool halve;
655
656 /* If the user specifies zero, then he/she doesn't want this virq
657 * to ever trigger. */
658 if ( opt_low_mem_virq == 0 )
659 {
660 low_mem_virq_th = -1UL;
661 return;
662 }
663
664 /* If the user did not specify a knob, remember that */
665 halve = (opt_low_mem_virq == ((paddr_t) -1));
666 threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;
667
668 /* Dom0 has already been allocated by now. So check we won't be
669 * complaining immediately with whatever's left of the heap. */
670 threshold = min(threshold,
671 ((paddr_t) total_avail_pages) << PAGE_SHIFT);
672
673 /* Then, cap to some predefined maximum */
674 threshold = min(threshold, MAX_LOW_MEM_VIRQ);
675
676 /* If the user specified no knob, and we are at the current available
677 * level, halve the threshold. */
678 if ( halve &&
679 (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
680 threshold >>= 1;
681
682 /* Zero? Have to fire immediately */
683 threshold = max(threshold, (paddr_t) PAGE_SIZE);
684
685 /* Threshold bytes -> pages */
686 low_mem_virq_th = threshold >> PAGE_SHIFT;
687
688 /* Next, round the threshold down to the next order */
689 order = get_order_from_pages(low_mem_virq_th);
690 if ( (1UL << order) > low_mem_virq_th )
691 order--;
692
693 /* Set bounds, ready to go */
694 low_mem_virq_th = low_mem_virq_orig = 1UL << order;
695 low_mem_virq_th_order = order;
696
697 printk("Initial low memory virq threshold set at %#lx pages.\n",
698 low_mem_virq_th);
699 }
700
static void check_low_mem_virq(void)
702 {
703 unsigned long avail_pages = total_avail_pages - outstanding_claims;
704
705 if ( unlikely(avail_pages <= low_mem_virq_th) )
706 {
707 send_global_virq(VIRQ_ENOMEM);
708
709 /* Update thresholds. Next warning will be when we drop below
710 * next order. However, we wait until we grow beyond one
711 * order above us to complain again at the current order */
712 low_mem_virq_high = 1UL << (low_mem_virq_th_order + 1);
713 if ( low_mem_virq_th_order > 0 )
714 low_mem_virq_th_order--;
715 low_mem_virq_th = 1UL << low_mem_virq_th_order;
716 return;
717 }
718
719 if ( unlikely(avail_pages >= low_mem_virq_high) )
720 {
721 /* Reset hysteresis. Bring threshold up one order.
722 * If we are back where originally set, set high
723 * threshold to -1 to avoid further growth of
724 * virq threshold. */
725 low_mem_virq_th_order++;
726 low_mem_virq_th = 1UL << low_mem_virq_th_order;
727 if ( low_mem_virq_th == low_mem_virq_orig )
728 low_mem_virq_high = -1UL;
729 else
730 low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
731 }
732 }
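/*
 * Numeric illustration of the hysteresis above, assuming a threshold of
 * 2^14 pages (64MiB with 4KiB pages): when free memory first drops to
 * 2^14 pages the VIRQ fires, the threshold falls to 2^13 and the reset
 * level becomes 2^15. Only once free memory climbs back above 2^15 pages
 * is the threshold raised back towards its original 2^14.
 */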
733
734 /* Pages that need a scrub are added to tail, otherwise to head. */
static void page_list_add_scrub(struct page_info *pg, unsigned int node,
736 unsigned int zone, unsigned int order,
737 unsigned int first_dirty)
738 {
739 PFN_ORDER(pg) = order;
740 pg->u.free.first_dirty = first_dirty;
741 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
742
743 if ( first_dirty != INVALID_DIRTY_IDX )
744 {
745 ASSERT(first_dirty < (1U << order));
746 page_list_add_tail(pg, &heap(node, zone, order));
747 }
748 else
749 page_list_add(pg, &heap(node, zone, order));
750 }
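/*
 * This head/tail discipline is what allows get_free_buddy() to take a clean
 * buddy straight from the head of a list and scrub_free_pages() to find
 * pending work by looking only at the tail (page_list_last()).
 */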
751
752 /* SCRUB_PATTERN needs to be a repeating series of bytes. */
753 #ifndef NDEBUG
754 #define SCRUB_PATTERN 0xc2c2c2c2c2c2c2c2ULL
755 #else
756 #define SCRUB_PATTERN 0ULL
757 #endif
758 #define SCRUB_BYTE_PATTERN (SCRUB_PATTERN & 0xff)
759
static void poison_one_page(struct page_info *pg)
761 {
762 #ifdef CONFIG_SCRUB_DEBUG
763 uint64_t *ptr;
764
765 if ( !scrub_debug )
766 return;
767
768 ptr = __map_domain_page(pg);
769 *ptr = ~SCRUB_PATTERN;
770 unmap_domain_page(ptr);
771 #endif
772 }
773
static void check_one_page(struct page_info *pg)
775 {
776 #ifdef CONFIG_SCRUB_DEBUG
777 const uint64_t *ptr;
778 unsigned int i;
779
780 if ( !scrub_debug )
781 return;
782
783 ptr = __map_domain_page(pg);
784 for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ )
785 BUG_ON(ptr[i] != SCRUB_PATTERN);
786 unmap_domain_page(ptr);
787 #endif
788 }
789
static void check_and_stop_scrub(struct page_info *head)
791 {
792 if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
793 {
794 typeof(head->u.free) pgfree;
795
796 head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
797 spin_lock_kick();
798 for ( ; ; )
799 {
800 /* Can't ACCESS_ONCE() a bitfield. */
801 pgfree.val = ACCESS_ONCE(head->u.free.val);
802 if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT )
803 break;
804 cpu_relax();
805 }
806 }
807 }
808
static struct page_info *get_free_buddy(unsigned int zone_lo,
810 unsigned int zone_hi,
811 unsigned int order, unsigned int memflags,
812 const struct domain *d)
813 {
814 nodeid_t first, node = MEMF_get_node(memflags), req_node = node;
815 nodemask_t nodemask = node_online_map;
816 unsigned int j, zone, nodemask_retry = 0;
817 struct page_info *pg;
818 bool use_unscrubbed = (memflags & MEMF_no_scrub);
819
820 /*
821 * d->node_affinity is our preferred allocation set if provided, but it
822 * may have bits set outside of node_online_map. Clamp it.
823 */
824 if ( d )
825 {
826 /*
         * It is the caller's responsibility to ensure that d->node_affinity
828 * isn't complete junk.
829 */
830 if ( nodes_intersects(nodemask, d->node_affinity) )
831 nodes_and(nodemask, nodemask, d->node_affinity);
832 else
833 ASSERT_UNREACHABLE();
834 }
835
836 if ( node == NUMA_NO_NODE )
837 {
838 if ( d != NULL )
839 node = cycle_node(d->last_alloc_node, nodemask);
840
841 if ( node >= MAX_NUMNODES )
842 node = cpu_to_node(smp_processor_id());
843 }
844 else if ( unlikely(node >= MAX_NUMNODES) )
845 {
846 ASSERT_UNREACHABLE();
847 return NULL;
848 }
849 first = node;
850
851 /*
     * Start with the requested node, but exhaust all of that node's memory
     * in the requested zone before failing. Only calculate a new node value
     * if we fail to find memory in the target node; this avoids needless
     * computation on the fast path.
855 */
856 for ( ; ; )
857 {
858 zone = zone_hi;
859 do {
860 /* Check if target node can support the allocation. */
861 if ( !avail[node] || (avail[node][zone] < (1UL << order)) )
862 continue;
863
864 /* Find smallest order which can satisfy the request. */
865 for ( j = order; j <= MAX_ORDER; j++ )
866 {
867 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
868 {
869 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
870 return pg;
871 /*
872 * We grab single pages (order=0) even if they are
873 * unscrubbed. Given that scrubbing one page is fairly quick
874 * it is not worth breaking higher orders.
875 */
876 if ( (order == 0) || use_unscrubbed )
877 {
878 check_and_stop_scrub(pg);
879 return pg;
880 }
881
882 page_list_add_tail(pg, &heap(node, zone, j));
883 }
884 }
885 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
886
887 if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
888 return NULL;
889
890 /* Pick next node. */
891 if ( !nodemask_test(node, &nodemask) )
892 {
893 /* Very first node may be caller-specified and outside nodemask. */
894 ASSERT(!nodemask_retry);
895 first = node = first_node(nodemask);
896 if ( node < MAX_NUMNODES )
897 continue;
898 }
899 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
900 node = first_node(nodemask);
901 if ( node == first )
902 {
903 /* When we have tried all in nodemask, we fall back to others. */
904 if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
905 return NULL;
906 nodes_andnot(nodemask, node_online_map, nodemask);
907 first = node = first_node(nodemask);
908 if ( node >= MAX_NUMNODES )
909 return NULL;
910 }
911 }
912 }
913
914 /* Allocate 2^@order contiguous pages. */
static struct page_info *alloc_heap_pages(
916 unsigned int zone_lo, unsigned int zone_hi,
917 unsigned int order, unsigned int memflags,
918 struct domain *d)
919 {
920 nodeid_t node;
921 unsigned int i, buddy_order, zone, first_dirty;
922 unsigned long request = 1UL << order;
923 struct page_info *pg;
924 bool need_tlbflush = false;
925 uint32_t tlbflush_timestamp = 0;
926 unsigned int dirty_cnt = 0;
927
928 /* Make sure there are enough bits in memflags for nodeID. */
929 BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
930
931 ASSERT(zone_lo <= zone_hi);
932 ASSERT(zone_hi < NR_ZONES);
933
934 if ( unlikely(order > MAX_ORDER) )
935 return NULL;
936
937 spin_lock(&heap_lock);
938
939 /*
940 * Claimed memory is considered unavailable unless the request
941 * is made by a domain with sufficient unclaimed pages.
942 */
943 if ( (outstanding_claims + request > total_avail_pages) &&
944 ((memflags & MEMF_no_refcount) ||
945 !d || d->outstanding_pages < request) )
946 {
947 spin_unlock(&heap_lock);
948 return NULL;
949 }
950
951 pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d);
952 /* Try getting a dirty buddy if we couldn't get a clean one. */
953 if ( !pg && !(memflags & MEMF_no_scrub) )
954 pg = get_free_buddy(zone_lo, zone_hi, order,
955 memflags | MEMF_no_scrub, d);
956 if ( !pg )
957 {
958 /* No suitable memory blocks. Fail the request. */
959 spin_unlock(&heap_lock);
960 return NULL;
961 }
962
963 node = phys_to_nid(page_to_maddr(pg));
964 zone = page_to_zone(pg);
965 buddy_order = PFN_ORDER(pg);
966
967 first_dirty = pg->u.free.first_dirty;
968
969 /* We may have to halve the chunk a number of times. */
970 while ( buddy_order != order )
971 {
972 buddy_order--;
973 page_list_add_scrub(pg, node, zone, buddy_order,
974 (1U << buddy_order) > first_dirty ?
975 first_dirty : INVALID_DIRTY_IDX);
976 pg += 1U << buddy_order;
977
978 if ( first_dirty != INVALID_DIRTY_IDX )
979 {
980 /* Adjust first_dirty */
981 if ( first_dirty >= 1U << buddy_order )
982 first_dirty -= 1U << buddy_order;
983 else
984 first_dirty = 0; /* We've moved past original first_dirty */
985 }
986 }
987
988 ASSERT(avail[node][zone] >= request);
989 avail[node][zone] -= request;
990 total_avail_pages -= request;
991 ASSERT(total_avail_pages >= 0);
992
993 check_low_mem_virq();
994
995 if ( d != NULL )
996 d->last_alloc_node = node;
997
998 for ( i = 0; i < (1 << order); i++ )
999 {
1000 /* Reference count must continuously be zero for free pages. */
1001 if ( (pg[i].count_info & ~PGC_need_scrub) != PGC_state_free )
1002 {
1003 printk(XENLOG_ERR
1004 "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
1005 i, mfn_x(page_to_mfn(pg + i)),
1006 pg[i].count_info, pg[i].v.free.order,
1007 pg[i].u.free.val, pg[i].tlbflush_timestamp);
1008 BUG();
1009 }
1010
1011 /* PGC_need_scrub can only be set if first_dirty is valid */
1012 ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub));
1013
1014 /* Preserve PGC_need_scrub so we can check it after lock is dropped. */
1015 pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub);
1016
1017 if ( !(memflags & MEMF_no_tlbflush) )
1018 accumulate_tlbflush(&need_tlbflush, &pg[i],
1019 &tlbflush_timestamp);
1020
1021 /* Initialise fields which have other uses for free pages. */
1022 pg[i].u.inuse.type_info = 0;
1023 page_set_owner(&pg[i], NULL);
1024
1025 /* Ensure cache and RAM are consistent for platforms where the
1026 * guest can control its own visibility of/through the cache.
1027 */
1028 flush_page_to_ram(mfn_x(page_to_mfn(&pg[i])),
1029 !(memflags & MEMF_no_icache_flush));
1030 }
1031
1032 spin_unlock(&heap_lock);
1033
1034 if ( first_dirty != INVALID_DIRTY_IDX ||
1035 (scrub_debug && !(memflags & MEMF_no_scrub)) )
1036 {
1037 for ( i = 0; i < (1U << order); i++ )
1038 {
1039 if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1040 {
1041 if ( !(memflags & MEMF_no_scrub) )
1042 scrub_one_page(&pg[i]);
1043
1044 dirty_cnt++;
1045
1046 spin_lock(&heap_lock);
1047 pg[i].count_info &= ~PGC_need_scrub;
1048 spin_unlock(&heap_lock);
1049 }
1050 else if ( !(memflags & MEMF_no_scrub) )
1051 check_one_page(&pg[i]);
1052 }
1053
1054 if ( dirty_cnt )
1055 {
1056 spin_lock(&heap_lock);
1057 node_need_scrub[node] -= dirty_cnt;
1058 spin_unlock(&heap_lock);
1059 }
1060 }
1061
1062 if ( need_tlbflush )
1063 filtered_flush_tlb_mask(tlbflush_timestamp);
1064
1065 return pg;
1066 }
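/*
 * Worked example of the splitting loop in alloc_heap_pages() above: an
 * order-2 request (4 pages) satisfied from an order-5 buddy (32 pages)
 * re-queues buddies of order 4, 3 and 2 and returns the final 4-page
 * chunk, with first_dirty re-based at every step so each re-queued buddy
 * records where its unscrubbed tail begins.
 */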
1067
1068 /* Remove any offlined page in the buddy pointed to by head. */
static int reserve_offlined_page(struct page_info *head)
1070 {
1071 unsigned int node = phys_to_nid(page_to_maddr(head));
1072 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
1073 struct page_info *cur_head;
1074 unsigned int cur_order, first_dirty;
1075
1076 ASSERT(spin_is_locked(&heap_lock));
1077
1078 cur_head = head;
1079
1080 check_and_stop_scrub(head);
1081 /*
1082 * We may break the buddy so let's mark the head as clean. Then, when
1083 * merging chunks back into the heap, we will see whether the chunk has
1084 * unscrubbed pages and set its first_dirty properly.
1085 */
1086 first_dirty = head->u.free.first_dirty;
1087 head->u.free.first_dirty = INVALID_DIRTY_IDX;
1088
1089 page_list_del(head, &heap(node, zone, head_order));
1090
1091 while ( cur_head < (head + (1 << head_order)) )
1092 {
1093 struct page_info *pg;
1094 int next_order;
1095
1096 if ( page_state_is(cur_head, offlined) )
1097 {
1098 cur_head++;
1099 if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
1100 first_dirty--;
1101 continue;
1102 }
1103
1104 next_order = cur_order = 0;
1105
1106 while ( cur_order < head_order )
1107 {
1108 next_order = cur_order + 1;
1109
1110 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
1111 goto merge;
1112
1113 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
1114 i < (1 << next_order);
1115 i++, pg++ )
1116 if ( page_state_is(pg, offlined) )
1117 break;
1118 if ( i == ( 1 << next_order) )
1119 {
1120 cur_order = next_order;
1121 continue;
1122 }
1123 else
1124 {
1125 merge:
1126 /* We don't consider merging outside the head_order. */
1127 page_list_add_scrub(cur_head, node, zone, cur_order,
1128 (1U << cur_order) > first_dirty ?
1129 first_dirty : INVALID_DIRTY_IDX);
1130 cur_head += (1 << cur_order);
1131
1132 /* Adjust first_dirty if needed. */
1133 if ( first_dirty != INVALID_DIRTY_IDX )
1134 {
1135 if ( first_dirty >= 1U << cur_order )
1136 first_dirty -= 1U << cur_order;
1137 else
1138 first_dirty = 0;
1139 }
1140
1141 break;
1142 }
1143 }
1144 }
1145
1146 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
1147 {
1148 if ( !page_state_is(cur_head, offlined) )
1149 continue;
1150
1151 avail[node][zone]--;
1152 total_avail_pages--;
1153 ASSERT(total_avail_pages >= 0);
1154
1155 page_list_add_tail(cur_head,
1156 test_bit(_PGC_broken, &cur_head->count_info) ?
1157 &page_broken_list : &page_offlined_list);
1158
1159 count++;
1160 }
1161
1162 return count;
1163 }
1164
1165 static nodemask_t node_scrubbing;
1166
1167 /*
 * If get_node is true this will return the closest node that needs to be
 * scrubbed, with the appropriate bit in node_scrubbing set.
 * If get_node is false, this will return *a* node that needs to be
 * scrubbed; the node_scrubbing bitmask will not be updated.
1172 * If no node needs scrubbing then NUMA_NO_NODE is returned.
1173 */
static unsigned int node_to_scrub(bool get_node)
1175 {
1176 nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
1177 nodeid_t closest = NUMA_NO_NODE;
1178 u8 dist, shortest = 0xff;
1179
1180 if ( node == NUMA_NO_NODE )
1181 node = 0;
1182
1183 if ( node_need_scrub[node] &&
1184 (!get_node || !node_test_and_set(node, node_scrubbing)) )
1185 return node;
1186
1187 /*
1188 * See if there are memory-only nodes that need scrubbing and choose
1189 * the closest one.
1190 */
1191 local_node = node;
1192 for ( ; ; )
1193 {
1194 do {
1195 node = cycle_node(node, node_online_map);
1196 } while ( !cpumask_empty(&node_to_cpumask(node)) &&
1197 (node != local_node) );
1198
1199 if ( node == local_node )
1200 break;
1201
1202 if ( node_need_scrub[node] )
1203 {
1204 if ( !get_node )
1205 return node;
1206
1207 dist = __node_distance(local_node, node);
1208
1209 /*
             * Grab the node right away. If we find a closer node later we
             * will release this one. While there is a chance that another
             * CPU will not be able to scrub that node when it is searching
             * for scrub work at the same time, it will be able to do so the
             * next time it wakes up. The alternative would be to perform
             * this search under a lock, but then we'd need to take this
             * lock every time we come in here.
1216 */
1217 if ( (dist < shortest || closest == NUMA_NO_NODE) &&
1218 !node_test_and_set(node, node_scrubbing) )
1219 {
1220 if ( closest != NUMA_NO_NODE )
1221 node_clear(closest, node_scrubbing);
1222 shortest = dist;
1223 closest = node;
1224 }
1225 }
1226 }
1227
1228 return closest;
1229 }
1230
1231 struct scrub_wait_state {
1232 struct page_info *pg;
1233 unsigned int first_dirty;
1234 bool drop;
1235 };
1236
static void scrub_continue(void *data)
1238 {
1239 struct scrub_wait_state *st = data;
1240
1241 if ( st->drop )
1242 return;
1243
1244 if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1245 {
1246 /* There is a waiter for this buddy. Release it. */
1247 st->drop = true;
1248 st->pg->u.free.first_dirty = st->first_dirty;
1249 smp_wmb();
1250 st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1251 }
1252 }
1253
bool scrub_free_pages(void)
1255 {
1256 struct page_info *pg;
1257 unsigned int zone;
1258 unsigned int cpu = smp_processor_id();
1259 bool preempt = false;
1260 nodeid_t node;
1261 unsigned int cnt = 0;
1262
1263 node = node_to_scrub(true);
1264 if ( node == NUMA_NO_NODE )
1265 return false;
1266
1267 spin_lock(&heap_lock);
1268
1269 for ( zone = 0; zone < NR_ZONES; zone++ )
1270 {
1271 unsigned int order = MAX_ORDER;
1272
1273 do {
1274 while ( !page_list_empty(&heap(node, zone, order)) )
1275 {
1276 unsigned int i, dirty_cnt;
1277 struct scrub_wait_state st;
1278
1279 /* Unscrubbed pages are always at the end of the list. */
1280 pg = page_list_last(&heap(node, zone, order));
1281 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
1282 break;
1283
1284 ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING);
1285 pg->u.free.scrub_state = BUDDY_SCRUBBING;
1286
1287 spin_unlock(&heap_lock);
1288
1289 dirty_cnt = 0;
1290
1291 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
1292 {
1293 if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1294 {
1295 scrub_one_page(&pg[i]);
1296 /*
1297 * We can modify count_info without holding heap
1298 * lock since we effectively locked this buddy by
1299 * setting its scrub_state.
1300 */
1301 pg[i].count_info &= ~PGC_need_scrub;
1302 dirty_cnt++;
1303 cnt += 100; /* scrubbed pages add heavier weight. */
1304 }
1305 else
1306 cnt++;
1307
1308 if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1309 {
1310 /* Someone wants this chunk. Drop everything. */
1311
1312 pg->u.free.first_dirty = (i == (1U << order) - 1) ?
1313 INVALID_DIRTY_IDX : i + 1;
1314 smp_wmb();
1315 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1316
1317 spin_lock(&heap_lock);
1318 node_need_scrub[node] -= dirty_cnt;
1319 spin_unlock(&heap_lock);
1320 goto out_nolock;
1321 }
1322
1323 /*
1324 * Scrub a few (8) pages before becoming eligible for
1325 * preemption. But also count non-scrubbing loop iterations
1326 * so that we don't get stuck here with an almost clean
1327 * heap.
1328 */
1329 if ( cnt > 800 && softirq_pending(cpu) )
1330 {
1331 preempt = true;
1332 break;
1333 }
1334 }
1335
1336 st.pg = pg;
1337 /*
1338 * get_free_buddy() grabs a buddy with first_dirty set to
1339 * INVALID_DIRTY_IDX so we can't set pg's first_dirty here.
1340 * It will be set either below or in the lock callback (in
1341 * scrub_continue()).
1342 */
1343 st.first_dirty = (i >= (1U << order) - 1) ?
1344 INVALID_DIRTY_IDX : i + 1;
1345 st.drop = false;
1346 spin_lock_cb(&heap_lock, scrub_continue, &st);
1347
1348 node_need_scrub[node] -= dirty_cnt;
1349
1350 if ( st.drop )
1351 goto out;
1352
1353 if ( i >= (1U << order) - 1 )
1354 {
1355 page_list_del(pg, &heap(node, zone, order));
1356 page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
1357 }
1358 else
1359 pg->u.free.first_dirty = i + 1;
1360
1361 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1362
1363 if ( preempt || (node_need_scrub[node] == 0) )
1364 goto out;
1365 }
1366 } while ( order-- != 0 );
1367 }
1368
1369 out:
1370 spin_unlock(&heap_lock);
1371
1372 out_nolock:
1373 node_clear(node, node_scrubbing);
1374 return node_to_scrub(false) != NUMA_NO_NODE;
1375 }
1376
1377 /* Free 2^@order set of pages. */
static void free_heap_pages(
1379 struct page_info *pg, unsigned int order, bool need_scrub)
1380 {
1381 unsigned long mask;
1382 mfn_t mfn = page_to_mfn(pg);
1383 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
1384 unsigned int zone = page_to_zone(pg);
1385
1386 ASSERT(order <= MAX_ORDER);
1387 ASSERT(node >= 0);
1388
1389 spin_lock(&heap_lock);
1390
1391 for ( i = 0; i < (1 << order); i++ )
1392 {
1393 /*
1394 * Cannot assume that count_info == 0, as there are some corner cases
1395 * where it isn't the case and yet it isn't a bug:
1396 * 1. page_get_owner() is NULL
1397 * 2. page_get_owner() is a domain that was never accessible by
1398 * its domid (e.g., failed to fully construct the domain).
1399 * 3. page was never addressable by the guest (e.g., it's an
1400 * auto-translate-physmap guest and the page was never included
1401 * in its pseudophysical address space).
1402 * In all the above cases there can be no guest mappings of this page.
1403 */
1404 switch ( pg[i].count_info & PGC_state )
1405 {
1406 case PGC_state_inuse:
1407 BUG_ON(pg[i].count_info & PGC_broken);
1408 pg[i].count_info = PGC_state_free;
1409 break;
1410
1411 case PGC_state_offlining:
1412 pg[i].count_info = (pg[i].count_info & PGC_broken) |
1413 PGC_state_offlined;
1414 tainted = 1;
1415 break;
1416
1417 default:
1418 printk(XENLOG_ERR
1419 "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
1420 i, mfn_x(page_to_mfn(pg + i)),
1421 pg[i].count_info, pg[i].v.free.order,
1422 pg[i].u.free.val, pg[i].tlbflush_timestamp);
1423 BUG();
1424 }
1425
1426 /* If a page has no owner it will need no safety TLB flush. */
1427 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
1428 if ( pg[i].u.free.need_tlbflush )
1429 page_set_tlbflush_timestamp(&pg[i]);
1430
1431 /* This page is not a guest frame any more. */
1432 page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
1433 set_gpfn_from_mfn(mfn_x(mfn) + i, INVALID_M2P_ENTRY);
1434
1435 if ( need_scrub )
1436 {
1437 pg[i].count_info |= PGC_need_scrub;
1438 poison_one_page(&pg[i]);
1439 }
1440 }
1441
1442 avail[node][zone] += 1 << order;
1443 total_avail_pages += 1 << order;
1444 if ( need_scrub )
1445 {
1446 node_need_scrub[node] += 1 << order;
1447 pg->u.free.first_dirty = 0;
1448 }
1449 else
1450 pg->u.free.first_dirty = INVALID_DIRTY_IDX;
1451
1452 /* Merge chunks as far as possible. */
1453 while ( order < MAX_ORDER )
1454 {
1455 mask = 1UL << order;
1456
1457 if ( (mfn_x(page_to_mfn(pg)) & mask) )
1458 {
1459 struct page_info *predecessor = pg - mask;
1460
1461 /* Merge with predecessor block? */
1462 if ( !mfn_valid(page_to_mfn(predecessor)) ||
1463 !page_state_is(predecessor, free) ||
1464 (PFN_ORDER(predecessor) != order) ||
1465 (phys_to_nid(page_to_maddr(predecessor)) != node) )
1466 break;
1467
1468 check_and_stop_scrub(predecessor);
1469
1470 page_list_del(predecessor, &heap(node, zone, order));
1471
1472 /* Update predecessor's first_dirty if necessary. */
1473 if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
1474 pg->u.free.first_dirty != INVALID_DIRTY_IDX )
1475 predecessor->u.free.first_dirty = (1U << order) +
1476 pg->u.free.first_dirty;
1477
1478 pg = predecessor;
1479 }
1480 else
1481 {
1482 struct page_info *successor = pg + mask;
1483
1484 /* Merge with successor block? */
1485 if ( !mfn_valid(page_to_mfn(successor)) ||
1486 !page_state_is(successor, free) ||
1487 (PFN_ORDER(successor) != order) ||
1488 (phys_to_nid(page_to_maddr(successor)) != node) )
1489 break;
1490
1491 check_and_stop_scrub(successor);
1492
1493 /* Update pg's first_dirty if necessary. */
1494 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
1495 successor->u.free.first_dirty != INVALID_DIRTY_IDX )
1496 pg->u.free.first_dirty = (1U << order) +
1497 successor->u.free.first_dirty;
1498
1499 page_list_del(successor, &heap(node, zone, order));
1500 }
1501
1502 order++;
1503 }
1504
1505 page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
1506
1507 if ( tainted )
1508 reserve_offlined_page(pg);
1509
1510 spin_unlock(&heap_lock);
1511 }
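/*
 * Worked example of the merge loop in free_heap_pages() above: freeing an
 * order-0 page at MFN 0x101 first merges with its predecessor at 0x100
 * (provided that page is free, of equal order and on the same node) into
 * an order-1 buddy at 0x100, which may in turn merge with a free order-1
 * buddy at 0x102, and so on up to MAX_ORDER.
 */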
1512
1513
1514 /*
 * The following rules apply to page offlining:
 *   Once a page is broken, it can't be assigned any more.
 *   A page will be offlined only if it is free.
 * Returns the original count_info.
1519 */
static unsigned long mark_page_offline(struct page_info *pg, int broken)
1521 {
1522 unsigned long nx, x, y = pg->count_info;
1523
1524 ASSERT(page_is_ram_type(mfn_x(page_to_mfn(pg)), RAM_TYPE_CONVENTIONAL));
1525 ASSERT(spin_is_locked(&heap_lock));
1526
1527 do {
1528 nx = x = y;
1529
1530 if ( ((x & PGC_state) != PGC_state_offlined) &&
1531 ((x & PGC_state) != PGC_state_offlining) )
1532 {
1533 nx &= ~PGC_state;
1534 nx |= (((x & PGC_state) == PGC_state_free)
1535 ? PGC_state_offlined : PGC_state_offlining);
1536 }
1537
1538 if ( broken )
1539 nx |= PGC_broken;
1540
1541 if ( x == nx )
1542 break;
1543 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1544
1545 return y;
1546 }
1547
static int reserve_heap_page(struct page_info *pg)
1549 {
1550 struct page_info *head = NULL;
1551 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
1552 unsigned int zone = page_to_zone(pg);
1553
1554 for ( i = 0; i <= MAX_ORDER; i++ )
1555 {
1556 struct page_info *tmp;
1557
1558 if ( page_list_empty(&heap(node, zone, i)) )
1559 continue;
1560
1561 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
1562 {
1563 if ( (head <= pg) &&
1564 (head + (1UL << i) > pg) )
1565 return reserve_offlined_page(head);
1566 }
1567 }
1568
1569 return -EINVAL;
1570
1571 }
1572
int offline_page(mfn_t mfn, int broken, uint32_t *status)
1574 {
1575 unsigned long old_info = 0;
1576 struct domain *owner;
1577 struct page_info *pg;
1578
1579 if ( !mfn_valid(mfn) )
1580 {
1581 dprintk(XENLOG_WARNING,
1582 "try to offline out of range page %"PRI_mfn"\n", mfn_x(mfn));
1583 return -EINVAL;
1584 }
1585
1586 *status = 0;
1587 pg = mfn_to_page(mfn);
1588
1589 if ( is_xen_fixed_mfn(mfn) )
1590 {
1591 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
1592 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1593 return -EPERM;
1594 }
1595
1596 /*
1597 * N.B. xen's txt in x86_64 is marked reserved and handled already.
1598 * Also kexec range is reserved.
1599 */
1600 if ( !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) )
1601 {
1602 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
1603 return -EINVAL;
1604 }
1605
1606 /*
     * NB. When a broken page belongs to a guest, the hypervisor will usually
     * notify the guest to handle the broken page. However, the hypervisor
     * needs to prevent a malicious guest from accessing the broken page
     * again. In that case the hypervisor shuts the guest down, preventing
     * recursive MCEs.
1611 */
1612 if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
1613 {
1614 *status = PG_OFFLINE_AGAIN;
1615 domain_crash(owner);
1616 return 0;
1617 }
1618
1619 spin_lock(&heap_lock);
1620
1621 old_info = mark_page_offline(pg, broken);
1622
1623 if ( page_state_is(pg, offlined) )
1624 {
1625 reserve_heap_page(pg);
1626
1627 spin_unlock(&heap_lock);
1628
1629 *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN
1630 : PG_OFFLINE_OFFLINED;
1631 return 0;
1632 }
1633
1634 spin_unlock(&heap_lock);
1635
1636 if ( (owner = page_get_owner_and_reference(pg)) )
1637 {
1638 if ( p2m_pod_offline_or_broken_hit(pg) )
1639 {
1640 put_page(pg);
1641 p2m_pod_offline_or_broken_replace(pg);
1642 *status = PG_OFFLINE_OFFLINED;
1643 }
1644 else
1645 {
1646 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
1647 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
1648 /* Release the reference since it will not be allocated anymore */
1649 put_page(pg);
1650 }
1651 }
1652 else if ( old_info & PGC_xen_heap )
1653 {
1654 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
1655 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1656 }
1657 else
1658 {
1659 /*
         * assign_pages does not hold heap_lock, so there is a small window
         * in which the owner may be set later. Note that the owner can only
         * change from NULL to non-NULL, not vice versa, since the page is
         * being offlined now.
         * There is no window if called from the #MC handler, since all CPUs
         * are in softirq context. If called from user space (e.g. CE
         * handling), tools can wait some time before calling again.
1666 */
1667 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
1668 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
1669 }
1670
1671 if ( broken )
1672 *status |= PG_OFFLINE_BROKEN;
1673
1674 return 0;
1675 }
1676
1677 /*
1678 * Online the memory.
 * The caller should make sure end_pfn <= max_page; if not,
 * expand_pages() should be called prior to online_page().
1681 */
unsigned int online_page(mfn_t mfn, uint32_t *status)
1683 {
1684 unsigned long x, nx, y;
1685 struct page_info *pg;
1686 int ret;
1687
1688 if ( !mfn_valid(mfn) )
1689 {
1690 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1691 return -EINVAL;
1692 }
1693
1694 pg = mfn_to_page(mfn);
1695
1696 spin_lock(&heap_lock);
1697
1698 y = pg->count_info;
1699 do {
1700 ret = *status = 0;
1701
1702 if ( y & PGC_broken )
1703 {
1704 ret = -EINVAL;
1705 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
1706 break;
1707 }
1708
1709 if ( (y & PGC_state) == PGC_state_offlined )
1710 {
1711 page_list_del(pg, &page_offlined_list);
1712 *status = PG_ONLINE_ONLINED;
1713 }
1714 else if ( (y & PGC_state) == PGC_state_offlining )
1715 {
1716 *status = PG_ONLINE_ONLINED;
1717 }
1718 else
1719 {
1720 break;
1721 }
1722
1723 x = y;
1724 nx = (x & ~PGC_state) | PGC_state_inuse;
1725 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1726
1727 spin_unlock(&heap_lock);
1728
1729 if ( (y & PGC_state) == PGC_state_offlined )
1730 free_heap_pages(pg, 0, false);
1731
1732 return ret;
1733 }
1734
int query_page_offline(mfn_t mfn, uint32_t *status)
1736 {
1737 struct page_info *pg;
1738
1739 if ( !mfn_valid(mfn) || !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) )
1740 {
1741 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1742 return -EINVAL;
1743 }
1744
1745 *status = 0;
1746 spin_lock(&heap_lock);
1747
1748 pg = mfn_to_page(mfn);
1749
1750 if ( page_state_is(pg, offlining) )
1751 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
1752 if ( pg->count_info & PGC_broken )
1753 *status |= PG_OFFLINE_STATUS_BROKEN;
1754 if ( page_state_is(pg, offlined) )
1755 *status |= PG_OFFLINE_STATUS_OFFLINED;
1756
1757 spin_unlock(&heap_lock);
1758
1759 return 0;
1760 }
1761
1762 /*
1763 * Hand the specified arbitrary page range to the specified heap zone
1764 * checking the node_id of the previous page. If they differ and the
1765 * latter is not on a MAX_ORDER boundary, then we reserve the page by
1766 * not freeing it to the buddy allocator.
1767 */
static void init_heap_pages(
1769 struct page_info *pg, unsigned long nr_pages)
1770 {
1771 unsigned long i;
1772 bool idle_scrub = false;
1773
1774 /*
1775 * Keep MFN 0 away from the buddy allocator to avoid crossing zone
1776 * boundary when merging two buddies.
1777 */
1778 if ( !mfn_x(page_to_mfn(pg)) )
1779 {
1780 if ( nr_pages-- <= 1 )
1781 return;
1782 pg++;
1783 }
1784
1785
1786 /*
     * Some pages may not go through the boot allocator (e.g. reserved
1788 * memory at boot but released just after --- kernel, initramfs,
1789 * etc.).
1790 * Update first_valid_mfn to ensure those regions are covered.
1791 */
1792 spin_lock(&heap_lock);
1793 first_valid_mfn = mfn_min(page_to_mfn(pg), first_valid_mfn);
1794 spin_unlock(&heap_lock);
1795
1796 if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
1797 idle_scrub = true;
1798
1799 for ( i = 0; i < nr_pages; i++ )
1800 {
1801 unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
1802
1803 if ( unlikely(!avail[nid]) )
1804 {
1805 unsigned long s = mfn_x(page_to_mfn(pg + i));
1806 unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
1807 bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
1808 !(s & ((1UL << MAX_ORDER) - 1)) &&
1809 (find_first_set_bit(e) <= find_first_set_bit(s));
1810 unsigned long n;
1811
1812 n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
1813 &use_tail);
1814 BUG_ON(i + n > nr_pages);
1815 if ( n && !use_tail )
1816 {
1817 i += n - 1;
1818 continue;
1819 }
1820 if ( i + n == nr_pages )
1821 break;
1822 nr_pages -= n;
1823 }
1824
1825 free_heap_pages(pg + i, 0, scrub_debug || idle_scrub);
1826 }
1827 }
1828
static unsigned long avail_heap_pages(
1830 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
1831 {
1832 unsigned int i, zone;
1833 unsigned long free_pages = 0;
1834
1835 if ( zone_hi >= NR_ZONES )
1836 zone_hi = NR_ZONES - 1;
1837
1838 for_each_online_node(i)
1839 {
1840 if ( !avail[i] )
1841 continue;
1842 for ( zone = zone_lo; zone <= zone_hi; zone++ )
1843 if ( (node == -1) || (node == i) )
1844 free_pages += avail[i][zone];
1845 }
1846
1847 return free_pages;
1848 }
1849
void __init end_boot_allocator(void)
1851 {
1852 unsigned int i;
1853
1854 /* Pages that are free now go to the domain sub-allocator. */
1855 for ( i = 0; i < nr_bootmem_regions; i++ )
1856 {
1857 struct bootmem_region *r = &bootmem_region_list[i];
1858 if ( (r->s < r->e) &&
1859 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
1860 {
1861 init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
1862 r->e = r->s;
1863 break;
1864 }
1865 }
1866 for ( i = nr_bootmem_regions; i-- > 0; )
1867 {
1868 struct bootmem_region *r = &bootmem_region_list[i];
1869 if ( r->s < r->e )
1870 init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
1871 }
1872 nr_bootmem_regions = 0;
1873
1874 if ( !dma_bitsize && (num_online_nodes() > 1) )
1875 dma_bitsize = arch_get_dma_bitsize();
1876
1877 printk("Domain heap initialised");
1878 if ( dma_bitsize )
1879 printk(" DMA width %u bits", dma_bitsize);
1880 printk("\n");
1881 }
1882
static void __init smp_scrub_heap_pages(void *data)
1884 {
1885 unsigned long mfn, start, end;
1886 struct page_info *pg;
1887 struct scrub_region *r;
1888 unsigned int temp_cpu, cpu_idx = 0;
1889 nodeid_t node;
1890 unsigned int cpu = smp_processor_id();
1891
1892 if ( data )
1893 r = data;
1894 else
1895 {
1896 node = cpu_to_node(cpu);
1897 if ( node == NUMA_NO_NODE )
1898 return;
r = &region[node];
1900 }
1901
1902 /* Determine the current CPU's index into CPU's linked to this node. */
1903 for_each_cpu ( temp_cpu, &r->cpus )
1904 {
1905 if ( cpu == temp_cpu )
1906 break;
1907 cpu_idx++;
1908 }
1909
1910 /* Calculate the starting mfn for this CPU's memory block. */
1911 start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
1912
1913 /* Calculate the end mfn into this CPU's memory block for this iteration. */
1914 if ( r->offset + chunk_size >= r->per_cpu_sz )
1915 {
1916 end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
1917
1918 if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) )
1919 end += r->rem;
1920 }
1921 else
1922 end = start + chunk_size;
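    /*
     * Illustrative example (hypothetical numbers): with per_cpu_sz = 0x5000
     * pages and chunk_size = 0x2000, a CPU covers [0, 0x2000) and
     * [0x2000, 0x4000) of its block in the first two calls; on the third,
     * offset = 0x4000 and offset + chunk_size >= per_cpu_sz, so end is
     * clamped to the block boundary (plus r->rem on the last CPU), leaving
     * only 0x1000 pages to scrub.
     */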
1923
1924 for ( mfn = start; mfn < end; mfn++ )
1925 {
1926 pg = mfn_to_page(_mfn(mfn));
1927
1928 /* Check the mfn is valid and page is free. */
1929 if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) )
1930 continue;
1931
1932 scrub_one_page(pg);
1933 }
1934 }
1935
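/*
 * Pick at most one online CPU per core (the first sibling of each core) on
 * the given node into *dest, and return how many were selected.  The boot
 * scrubber uses this so that only one hyperthread per core does the
 * (memory-bound) scrubbing work.
 */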
1936 static int __init find_non_smt(unsigned int node, cpumask_t *dest)
1937 {
1938 cpumask_t node_cpus;
1939 unsigned int i, cpu;
1940
1941 cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
1942 cpumask_clear(dest);
1943 for_each_cpu ( i, &node_cpus )
1944 {
1945 if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
1946 continue;
1947 cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
1948 __cpumask_set_cpu(cpu, dest);
1949 }
1950 return cpumask_weight(dest);
1951 }
1952
1953 /*
1954 * Scrub all unallocated pages in all heap zones. This function uses all
1955  * online CPUs to scrub the memory in parallel.
1956 */
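/*
 * The work is organised in rounds: round #0 sizes each node's region and a
 * per-CPU share of it, round #1 has every worker CPU scrub its own node's
 * share in chunk_size pieces (processing softirqs between chunks), and
 * round #2 covers nodes without online CPUs using CPUs from the closest
 * node that has some (or the BSP as a last resort).
 */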
1957 static void __init scrub_heap_pages(void)
1958 {
1959 cpumask_t node_cpus, all_worker_cpus;
1960 unsigned int i, j;
1961 unsigned long offset, max_per_cpu_sz = 0;
1962 unsigned long start, end;
1963 unsigned long rem = 0;
1964 int last_distance, best_node;
1965 int cpus;
1966
1967 cpumask_clear(&all_worker_cpus);
1968 /* Scrub block size. */
1969 chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
1970 if ( chunk_size == 0 )
1971 chunk_size = MB(128) >> PAGE_SHIFT;
1972
1973 /* Round #0 - figure out amounts and which CPUs to use. */
1974 for_each_online_node ( i )
1975 {
1976 if ( !node_spanned_pages(i) )
1977 continue;
1978 /* Calculate Node memory start and end address. */
1979 start = max(node_start_pfn(i), mfn_x(first_valid_mfn));
1980 end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
1981 /* Just in case NODE has 1 page and starts below first_valid_mfn. */
1982 end = max(end, start);
1983         /* CPUs that are online and on this node (if there are none, that is OK). */
1984 cpus = find_non_smt(i, &node_cpus);
1985 cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
1986 if ( cpus <= 0 )
1987 {
1988             /* No CPUs on this node. Round #2 will take care of it. */
1989 rem = 0;
1990 region[i].per_cpu_sz = (end - start);
1991 }
1992 else
1993 {
1994 rem = (end - start) % cpus;
1995 region[i].per_cpu_sz = (end - start) / cpus;
1996 if ( region[i].per_cpu_sz > max_per_cpu_sz )
1997 max_per_cpu_sz = region[i].per_cpu_sz;
1998 }
1999 region[i].start = start;
2000 region[i].rem = rem;
2001         cpumask_copy(&region[i].cpus, &node_cpus);
2002 }
2003
2004 printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(),
2005 cpumask_weight(&all_worker_cpus));
2006
2007     /* Round #1 - do NUMA nodes with CPUs. */
2008 for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
2009 {
2010 for_each_online_node ( i )
2011 region[i].offset = offset;
2012
2013 process_pending_softirqs();
2014
2015 spin_lock(&heap_lock);
2016 on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
2017 spin_unlock(&heap_lock);
2018
2019 printk(".");
2020 }
2021
2022     /*
2023      * Round #2: NUMA nodes that have no CPUs of their own get scrubbed by
2024      * the CPUs of the closest online node that does have CPUs.
2025      */
2026 for_each_online_node ( i )
2027 {
2028 node_cpus = node_to_cpumask(i);
2029
2030 if ( !cpumask_empty(&node_cpus) )
2031 continue;
2032
2033 last_distance = INT_MAX;
2034 best_node = first_node(node_online_map);
2035         /* Figure out which node's CPUs are closest. */
2036 for_each_online_node ( j )
2037 {
2038 u8 distance;
2039
2040 if ( cpumask_empty(&node_to_cpumask(j)) )
2041 continue;
2042
2043 distance = __node_distance(i, j);
2044 if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
2045 {
2046 last_distance = distance;
2047 best_node = j;
2048 }
2049 }
2050         /*
2051          * Use CPUs from the best node; if even that node (which defaults to
2052          * the first online node) has no CPUs, fall back to the BSP.
2053          */
2054 cpus = find_non_smt(best_node, &node_cpus);
2055 if ( cpus == 0 )
2056 {
2057 __cpumask_set_cpu(smp_processor_id(), &node_cpus);
2058 cpus = 1;
2059 }
2060 /* We already have the node information from round #0. */
2061 region[i].rem = region[i].per_cpu_sz % cpus;
2062 region[i].per_cpu_sz /= cpus;
2063 max_per_cpu_sz = region[i].per_cpu_sz;
2064         cpumask_copy(&region[i].cpus, &node_cpus);
2065
2066 for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
2067 {
2068 region[i].offset = offset;
2069
2070 process_pending_softirqs();
2071
2072 spin_lock(&heap_lock);
2073             on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 1);
2074 spin_unlock(&heap_lock);
2075
2076 printk(".");
2077 }
2078 }
2079
2080 printk("done.\n");
2081
2082 #ifdef CONFIG_SCRUB_DEBUG
2083 scrub_debug = true;
2084 #endif
2085 }
2086
2087 void __init heap_init_late(void)
2088 {
2089     /*
2090      * Now that the heap is initialised, set the bounds used by the
2091      * low-memory virq algorithm.
2092      */
2093 setup_low_mem_virq();
2094
2095 switch ( opt_bootscrub )
2096 {
2097 default:
2098 ASSERT_UNREACHABLE();
2099 /* Fall through */
2100
2101 case BOOTSCRUB_IDLE:
2102 printk("Scrubbing Free RAM in background\n");
2103 break;
2104
2105 case BOOTSCRUB_ON:
2106 scrub_heap_pages();
2107 break;
2108
2109 case BOOTSCRUB_OFF:
2110 break;
2111 }
2112 }
2113
2114
2115 /*************************
2116 * XEN-HEAP SUB-ALLOCATOR
2117 */
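/*
 * Illustrative usage of this interface (a sketch, not taken from a real
 * caller); order 0 requests a single, always-mapped page:
 *
 *     void *p = alloc_xenheap_pages(0, 0);
 *
 *     if ( p )
 *     {
 *         ...
 *         free_xenheap_pages(p, 0);
 *     }
 *
 * Order-0 callers can also use the alloc_xenheap_page()/free_xenheap_page()
 * wrappers.
 */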
2118
2119 #if defined(CONFIG_SEPARATE_XENHEAP)
2120
2121 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2122 {
2123 ps = round_pgup(ps);
2124 pe = round_pgdown(pe);
2125 if ( pe <= ps )
2126 return;
2127
2128 /*
2129 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
2130 * prevent merging of power-of-two blocks across the zone boundary.
2131 */
2132 if ( ps && !is_xen_heap_mfn(mfn_add(maddr_to_mfn(ps), -1)) )
2133 ps += PAGE_SIZE;
2134 if ( !is_xen_heap_mfn(maddr_to_mfn(pe)) )
2135 pe -= PAGE_SIZE;
2136
2137 memguard_guard_range(maddr_to_virt(ps), pe - ps);
2138
2139 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
2140 }
2141
2142
2143 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2144 {
2145 struct page_info *pg;
2146
2147 ASSERT(!in_irq());
2148
2149 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
2150 order, memflags | MEMF_no_scrub, NULL);
2151 if ( unlikely(pg == NULL) )
2152 return NULL;
2153
2154 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
2155
2156 return page_to_virt(pg);
2157 }
2158
2159
2160 void free_xenheap_pages(void *v, unsigned int order)
2161 {
2162 ASSERT(!in_irq());
2163
2164 if ( v == NULL )
2165 return;
2166
2167 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
2168
2169 free_heap_pages(virt_to_page(v), order, false);
2170 }
2171
2172 #else /* !CONFIG_SEPARATE_XENHEAP */
2173
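/*
 * Record the limit of the direct map: Xen heap allocations are then
 * constrained (via MEMF_bits()) to physical addresses representable in
 * xenheap_bits bits.  Worked example (illustrative): with 4k pages
 * (PAGE_SHIFT == 12) and a direct map covering MFNs up to and including
 * 0xfffff (the first 4GiB), flsl(0xfffff + 1) - 1 == 20, so
 * xenheap_bits == 32 unless PADDR_BITS is smaller.
 */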
2174 void __init xenheap_max_mfn(unsigned long mfn)
2175 {
2176 ASSERT(!first_node_initialised);
2177 ASSERT(!xenheap_bits);
2178 BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
2179 xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
2180 printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
2181 }
2182
2183 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2184 {
2185 init_domheap_pages(ps, pe);
2186 }
2187
2188 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2189 {
2190 struct page_info *pg;
2191 unsigned int i;
2192
2193 ASSERT(!in_irq());
2194
2195 if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
2196 memflags &= ~MEMF_bits(~0U);
2197 if ( !(memflags >> _MEMF_bits) )
2198 memflags |= MEMF_bits(xenheap_bits);
2199
2200 pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub);
2201 if ( unlikely(pg == NULL) )
2202 return NULL;
2203
2204 for ( i = 0; i < (1u << order); i++ )
2205 pg[i].count_info |= PGC_xen_heap;
2206
2207 return page_to_virt(pg);
2208 }
2209
2210 void free_xenheap_pages(void *v, unsigned int order)
2211 {
2212 struct page_info *pg;
2213 unsigned int i;
2214
2215 ASSERT(!in_irq());
2216
2217 if ( v == NULL )
2218 return;
2219
2220 pg = virt_to_page(v);
2221
2222 for ( i = 0; i < (1u << order); i++ )
2223 pg[i].count_info &= ~PGC_xen_heap;
2224
2225 free_heap_pages(pg, order, true);
2226 }
2227
2228 #endif /* CONFIG_SEPARATE_XENHEAP */
2229
2230
2231
2232 /*************************
2233 * DOMAIN-HEAP SUB-ALLOCATOR
2234 */
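/*
 * Illustrative usage (a sketch, not taken from a real caller): an anonymous
 * dom heap page below 4GiB, which must be mapped before it can be touched:
 *
 *     struct page_info *pg = alloc_domheap_pages(NULL, 0, MEMF_bits(32));
 *
 *     if ( pg )
 *     {
 *         void *p = __map_domain_page(pg);
 *         ...
 *         unmap_domain_page(p);
 *         free_domheap_pages(pg, 0);
 *     }
 *
 * Passing a non-NULL domain without MEMF_no_owner additionally assigns the
 * pages to that domain via assign_pages().
 */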
2235
2236 void init_domheap_pages(paddr_t ps, paddr_t pe)
2237 {
2238 mfn_t smfn, emfn;
2239
2240 ASSERT(!in_irq());
2241
2242 smfn = maddr_to_mfn(round_pgup(ps));
2243 emfn = maddr_to_mfn(round_pgdown(pe));
2244
2245 if ( mfn_x(emfn) <= mfn_x(smfn) )
2246 return;
2247
2248 init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn));
2249 }
2250
2251
2252 int assign_pages(
2253 struct domain *d,
2254 struct page_info *pg,
2255 unsigned int order,
2256 unsigned int memflags)
2257 {
2258 int rc = 0;
2259 unsigned long i;
2260
2261 spin_lock(&d->page_alloc_lock);
2262
2263 if ( unlikely(d->is_dying) )
2264 {
2265 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
2266 d->domain_id);
2267 rc = -EINVAL;
2268 goto out;
2269 }
2270
2271 #ifndef NDEBUG
2272 {
2273 unsigned int extra_pages = 0;
2274
2275 for ( i = 0; i < (1ul << order); i++ )
2276 {
2277 ASSERT(!(pg[i].count_info & ~PGC_extra));
2278 if ( pg[i].count_info & PGC_extra )
2279 extra_pages++;
2280 }
2281
2282 ASSERT(!extra_pages ||
2283 ((memflags & MEMF_no_refcount) &&
2284 extra_pages == 1u << order));
2285 }
2286 #endif
2287
2288 if ( pg[0].count_info & PGC_extra )
2289 {
2290 d->extra_pages += 1u << order;
2291 memflags &= ~MEMF_no_refcount;
2292 }
2293 else if ( !(memflags & MEMF_no_refcount) )
2294 {
2295 unsigned int tot_pages = domain_tot_pages(d) + (1 << order);
2296
2297 if ( unlikely(tot_pages > d->max_pages) )
2298 {
2299 gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
2300 "%u > %u\n", d->domain_id, tot_pages, d->max_pages);
2301 rc = -E2BIG;
2302 goto out;
2303 }
2304 }
2305
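    /*
     * The domain gains a reference the first time it is given any pages
     * (tot_pages going from zero to non-zero); free_domheap_pages() drops
     * it again once tot_pages returns to zero.
     */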
2306 if ( !(memflags & MEMF_no_refcount) &&
2307 unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) )
2308 get_knownalive_domain(d);
2309
2310 for ( i = 0; i < (1 << order); i++ )
2311 {
2312 ASSERT(page_get_owner(&pg[i]) == NULL);
2313 page_set_owner(&pg[i], d);
2314 smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
2315 pg[i].count_info =
2316 (pg[i].count_info & PGC_extra) | PGC_allocated | 1;
2317 page_list_add_tail(&pg[i], page_to_list(d, &pg[i]));
2318 }
2319
2320 out:
2321 spin_unlock(&d->page_alloc_lock);
2322 return rc;
2323 }
2324
2325
2326 struct page_info *alloc_domheap_pages(
2327 struct domain *d, unsigned int order, unsigned int memflags)
2328 {
2329 struct page_info *pg = NULL;
2330 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
2331 unsigned int dma_zone;
2332
2333 ASSERT(!in_irq());
2334
2335 bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
2336 bits ? : (BITS_PER_LONG+PAGE_SHIFT));
2337 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
2338 return NULL;
2339
2340 if ( memflags & MEMF_no_owner )
2341 memflags |= MEMF_no_refcount;
2342
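    /*
     * Prefer memory above the DMA boundary: if dma_bitsize is set, first
     * try only the zones above it, and only fall back to the full range of
     * zones (above MEMZONE_XEN) if that fails and the caller did not pass
     * MEMF_no_dma.
     */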
2343 if ( !dma_bitsize )
2344 memflags &= ~MEMF_no_dma;
2345 else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi )
2346 pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
2347
2348 if ( (pg == NULL) &&
2349 ((memflags & MEMF_no_dma) ||
2350 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
2351 memflags, d)) == NULL)) )
2352 return NULL;
2353
2354 if ( d && !(memflags & MEMF_no_owner) )
2355 {
2356 if ( memflags & MEMF_no_refcount )
2357 {
2358 unsigned long i;
2359
2360 for ( i = 0; i < (1ul << order); i++ )
2361 {
2362 ASSERT(!pg[i].count_info);
2363 pg[i].count_info = PGC_extra;
2364 }
2365 }
2366 if ( assign_pages(d, pg, order, memflags) )
2367 {
2368 free_heap_pages(pg, order, memflags & MEMF_no_scrub);
2369 return NULL;
2370 }
2371 }
2372
2373 return pg;
2374 }
2375
2376 void free_domheap_pages(struct page_info *pg, unsigned int order)
2377 {
2378 struct domain *d = page_get_owner(pg);
2379 unsigned int i;
2380 bool drop_dom_ref;
2381
2382 ASSERT(!in_irq());
2383
2384 if ( unlikely(is_xen_heap_page(pg)) )
2385 {
2386 /* NB. May recursively lock from relinquish_memory(). */
2387 spin_lock_recursive(&d->page_alloc_lock);
2388
2389 for ( i = 0; i < (1 << order); i++ )
2390 arch_free_heap_page(d, &pg[i]);
2391
2392 d->xenheap_pages -= 1 << order;
2393 drop_dom_ref = (d->xenheap_pages == 0);
2394
2395 spin_unlock_recursive(&d->page_alloc_lock);
2396 }
2397 else
2398 {
2399 bool scrub;
2400
2401 if ( likely(d) && likely(d != dom_cow) )
2402 {
2403 /* NB. May recursively lock from relinquish_memory(). */
2404 spin_lock_recursive(&d->page_alloc_lock);
2405
2406 for ( i = 0; i < (1 << order); i++ )
2407 {
2408 if ( pg[i].u.inuse.type_info & PGT_count_mask )
2409 {
2410 printk(XENLOG_ERR
2411 "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n",
2412 i, mfn_x(page_to_mfn(pg + i)),
2413 pg[i].count_info, pg[i].v.free.order,
2414 pg[i].u.free.val, pg[i].tlbflush_timestamp);
2415 BUG();
2416 }
2417 arch_free_heap_page(d, &pg[i]);
2418 if ( pg[i].count_info & PGC_extra )
2419 {
2420 ASSERT(d->extra_pages);
2421 d->extra_pages--;
2422 }
2423 }
2424
2425 drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order));
2426
2427 spin_unlock_recursive(&d->page_alloc_lock);
2428
2429             /*
2430              * Normally we expect a domain to clear pages before freeing them,
2431              * if it cares about the secrecy of their contents. However, after
2432              * a domain has died we assume responsibility for erasure. We also
2433              * scrub unconditionally if the scrub_domheap option is set.
2434              */
2435 scrub = d->is_dying || scrub_debug || opt_scrub_domheap;
2436 }
2437 else
2438 {
2439 /*
2440 * All we need to check is that on dom_cow only order-0 chunks
2441 * make it here. Due to the if() above, the only two possible
2442 * cases right now are d == NULL and d == dom_cow. To protect
2443 * against relaxation of that if() condition without updating the
2444 * check here, don't check d != dom_cow for now.
2445 */
2446 ASSERT(!d || !order);
2447 drop_dom_ref = false;
2448             scrub = true;
2449 }
2450
2451 free_heap_pages(pg, order, scrub);
2452 }
2453
2454 if ( drop_dom_ref )
2455 put_domain(d);
2456 }
2457
2458 unsigned long avail_domheap_pages_region(
2459 unsigned int node, unsigned int min_width, unsigned int max_width)
2460 {
2461 int zone_lo, zone_hi;
2462
2463 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
2464 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
2465
2466 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
2467 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
2468
2469 return avail_heap_pages(zone_lo, zone_hi, node);
2470 }
2471
2472 unsigned long avail_domheap_pages(void)
2473 {
2474 return avail_heap_pages(MEMZONE_XEN + 1,
2475 NR_ZONES - 1,
2476 -1);
2477 }
2478
2479 unsigned long avail_node_heap_pages(unsigned int nodeid)
2480 {
2481     return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
2482 }
2483
2484
2485 static void pagealloc_info(unsigned char key)
2486 {
2487 unsigned int zone = MEMZONE_XEN;
2488 unsigned long n, total = 0;
2489
2490 printk("Physical memory information:\n");
2491 printk(" Xen heap: %lukB free\n",
2492 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
2493
2494 while ( ++zone < NR_ZONES )
2495 {
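        /*
         * Zone numbers are keyed to the position of an MFN's most
         * significant bit, so a zone's pages are addressable with
         * (zone + PAGE_SHIFT) bits of physical address; the zones counted
         * before this point are reported as the DMA heap.
         */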
2496 if ( (zone + PAGE_SHIFT) == dma_bitsize )
2497 {
2498 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
2499 total = 0;
2500 }
2501
2502 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
2503 {
2504 total += n;
2505 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
2506 }
2507 }
2508
2509 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
2510 }
2511
2512 static __init int pagealloc_keyhandler_init(void)
2513 {
2514 register_keyhandler('m', pagealloc_info, "memory info", 1);
2515 return 0;
2516 }
2517 __initcall(pagealloc_keyhandler_init);
2518
2519
2520 void scrub_one_page(struct page_info *pg)
2521 {
2522 if ( unlikely(pg->count_info & PGC_broken) )
2523 return;
2524
2525 #ifndef NDEBUG
2526 /* Avoid callers relying on allocations returning zeroed pages. */
2527 unmap_domain_page(memset(__map_domain_page(pg),
2528 SCRUB_BYTE_PATTERN, PAGE_SIZE));
2529 #else
2530 /* For a production build, clear_page() is the fastest way to scrub. */
2531     clear_domain_page(page_to_mfn(pg));
2532 #endif
2533 }
2534
2535 static void dump_heap(unsigned char key)
2536 {
2537 s_time_t now = NOW();
2538 int i, j;
2539
2540 printk("'%c' pressed -> dumping heap info (now = %"PRI_stime")\n", key,
2541 now);
2542
2543 for ( i = 0; i < MAX_NUMNODES; i++ )
2544 {
2545 if ( !avail[i] )
2546 continue;
2547 for ( j = 0; j < NR_ZONES; j++ )
2548 printk("heap[node=%d][zone=%d] -> %lu pages\n",
2549 i, j, avail[i][j]);
2550 }
2551
2552 for ( i = 0; i < MAX_NUMNODES; i++ )
2553 {
2554 if ( !node_need_scrub[i] )
2555 continue;
2556 printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]);
2557 }
2558 }
2559
2560 static __init int register_heap_trigger(void)
2561 {
2562 register_keyhandler('H', dump_heap, "dump heap info", 1);
2563 return 0;
2564 }
2565 __initcall(register_heap_trigger);
2566
2567 struct domain *get_pg_owner(domid_t domid)
2568 {
2569 struct domain *pg_owner = NULL, *curr = current->domain;
2570
2571 if ( likely(domid == DOMID_SELF) )
2572 {
2573 pg_owner = rcu_lock_current_domain();
2574 goto out;
2575 }
2576
2577 if ( unlikely(domid == curr->domain_id) )
2578 {
2579 gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
2580 goto out;
2581 }
2582
2583 switch ( domid )
2584 {
2585 case DOMID_IO:
2586 pg_owner = rcu_lock_domain(dom_io);
2587 break;
2588
2589 case DOMID_XEN:
2590 pg_owner = rcu_lock_domain(dom_xen);
2591 break;
2592
2593 default:
2594 if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2595 gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
2596 break;
2597 }
2598
2599 out:
2600 return pg_owner;
2601 }
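/*
 * Illustrative caller pattern (a sketch, not from this file):
 *
 *     struct domain *pg_owner = get_pg_owner(domid);
 *
 *     if ( pg_owner == NULL )
 *         return -ESRCH;
 *     ...
 *     rcu_unlock_domain(pg_owner);
 *
 * Every successful call must be balanced by dropping the RCU reference
 * taken here, either directly with rcu_unlock_domain() or via the
 * put_pg_owner() helper where one is provided.
 */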
2602
2603 /*
2604 * Local variables:
2605 * mode: C
2606 * c-file-style: "BSD"
2607 * c-basic-offset: 4
2608 * tab-width: 4
2609 * indent-tabs-mode: nil
2610 * End:
2611 */
2612