1 #include <xen/init.h>
2 #include <xen/lib.h>
3 #include <xen/mm.h>
4 #include <xen/param.h>
5 #include <xen/compat.h>
6 #include <xen/dmi.h>
7 #include <xen/pfn.h>
8 #include <asm/e820.h>
9 #include <asm/page.h>
10 #include <asm/processor.h>
11 #include <asm/mtrr.h>
12 #include <asm/msr.h>
13 #include <asm/guest.h>
14
/*
 * opt_mem: Limit maximum address of physical RAM.
 * Any RAM beyond this address limit is ignored.
 */
static unsigned long long __initdata opt_mem;
size_param("mem", opt_mem);

/*
 * opt_availmem: Limit maximum usable amount of physical RAM.
 * Any RAM beyond this limited amount is ignored.
 */
static unsigned long long __initdata opt_availmem;
size_param("availmem", opt_availmem);

/*
 * "e820-mtrr-clip": Clip RAM to the highest cacheable MTRR range.
 * -1 (default) means auto-detect: enabled on Intel CPUs only (see
 * mtrr_top_of_ram()).
 */
static s8 __initdata e820_mtrr_clip = -1;
boolean_param("e820-mtrr-clip", e820_mtrr_clip);

/* opt_e820_verbose: Be verbose about clipping, the original e820, &c */
static bool __initdata e820_verbose;
boolean_param("e820-verbose", e820_verbose);

/* Final, sanitized memory map used by the rest of Xen. */
struct e820map e820;
/* Raw memory map as supplied by the boot environment; boot-time only. */
struct e820map __initdata e820_raw;
39
40 /*
41 * This function checks if the entire range [start,end) is mapped with type.
42 *
43 * Note: this function only works correct if the e820 table is sorted and
44 * not-overlapping, which is the case
45 */
e820_all_mapped(u64 start,u64 end,unsigned type)46 int __init e820_all_mapped(u64 start, u64 end, unsigned type)
47 {
48 unsigned int i;
49
50 for (i = 0; i < e820.nr_map; i++) {
51 struct e820entry *ei = &e820.map[i];
52
53 if (type && ei->type != type)
54 continue;
55 /* is the region (part) in overlap with the current region ?*/
56 if (ei->addr >= end || ei->addr + ei->size <= start)
57 continue;
58
59 /*
60 * If the region is at the beginning of [start,end) we move
61 * start to the end of the region since it's ok until there
62 */
63 if (ei->addr <= start)
64 start = ei->addr + ei->size;
65 /*
66 * if start is now at or beyond end, we're done, full
67 * coverage
68 */
69 if (start >= end)
70 return 1;
71 }
72 return 0;
73 }
74
add_memory_region(unsigned long long start,unsigned long long size,int type)75 static void __init add_memory_region(unsigned long long start,
76 unsigned long long size, int type)
77 {
78 unsigned int x = e820.nr_map;
79
80 if (x == ARRAY_SIZE(e820.map)) {
81 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
82 return;
83 }
84
85 e820.map[x].addr = start;
86 e820.map[x].size = size;
87 e820.map[x].type = type;
88 e820.nr_map++;
89 }
90
print_e820_memory_map(struct e820entry * map,unsigned int entries)91 static void __init print_e820_memory_map(struct e820entry *map, unsigned int entries)
92 {
93 unsigned int i;
94
95 for (i = 0; i < entries; i++) {
96 printk(" [%016Lx, %016Lx] ",
97 (unsigned long long)(map[i].addr),
98 (unsigned long long)(map[i].addr + map[i].size) - 1);
99 switch (map[i].type) {
100 case E820_RAM:
101 printk("(usable)\n");
102 break;
103 case E820_RESERVED:
104 printk("(reserved)\n");
105 break;
106 case E820_ACPI:
107 printk("(ACPI data)\n");
108 break;
109 case E820_NVS:
110 printk("(ACPI NVS)\n");
111 break;
112 case E820_UNUSABLE:
113 printk("(unusable)\n");
114 break;
115 default:
116 printk("type %u\n", map[i].type);
117 break;
118 }
119 }
120 }
121
/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries. The following
 * replaces the original e820 map with a new one, removing overlaps.
 *
 */
struct change_member {
	struct e820entry *pbios; /* pointer to original bios entry */
	unsigned long long addr; /* address for this change point */
};
/* Scratch state for sanitize_e820_map(); boot-time (__initdata) only. */
static struct change_member change_point_list[2*E820MAX] __initdata;
static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;

/*
 * Rewrite @biosmap in place as a sorted, non-overlapping map; where
 * entries overlapped, the numerically largest type wins.  *@pnr_map is
 * updated with the new entry count.  Returns 0 on success, -1 if the
 * map has fewer than two entries or contains a region whose end
 * address wraps around 2^64.
 */
int __init sanitize_e820_map(struct e820entry *biosmap, unsigned int *pnr_map)
{
	struct change_member *change_tmp;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	bool still_changing;
	unsigned int i, chgidx, overlap_entries, new_bios_entry;
	unsigned int old_nr, new_nr, chg_nr;

	/*
	   Visually we're performing the following (1,2,3,4 = memory types)...

	   Sample memory map (w/overlaps):
	      ____22__________________
	      ______________________4_
	      ____1111________________
	      _44_____________________
	      11111111________________
	      ____________________33__
	      ___________44___________
	      __________33333_________
	      ______________22________
	      ___________________2222_
	      _________111111111______
	      _____________________11_
	      _________________4______

	   Sanitized equivalent (no overlap):
	      1_______________________
	      _44_____________________
	      ___1____________________
	      ____22__________________
	      ______11________________
	      _________1______________
	      __________3_____________
	      ___________44___________
	      _____________33_________
	      _______________2________
	      ________________1_______
	      _________________4______
	      ___________________2____
	      ____________________33__
	      ______________________4_
	*/

	/* if there's only one memory region, don't bother */
	if (*pnr_map < 2)
		return -1;

	old_nr = *pnr_map;

	/* bail out if we find any unreasonable addresses in bios map */
	for (i=0; i<old_nr; i++)
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
			return -1;

	/* create pointers for initial change-point information (for sorting) */
	for (i=0; i < 2*old_nr; i++)
		change_point[i] = &change_point_list[i];

	/* record all known change-points (starting and ending addresses),
	   omitting those that are for empty memory regions */
	chgidx = 0;
	for (i=0; i < old_nr; i++) {
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr = biosmap[i].addr;
			change_point[chgidx++]->pbios = &biosmap[i];
			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
			change_point[chgidx++]->pbios = &biosmap[i];
		}
	}
	chg_nr = chgidx;    	/* true number of change-points */

	/* sort change-point list by memory addresses (low -> high) */
	/* Bubble sort: chg_nr is small (at most 2*E820MAX), so fine at boot. */
	still_changing = true;
	while (still_changing) {
		still_changing = false;
		for (i=1; i < chg_nr; i++) {
			/* if <current_addr> > <last_addr>, swap */
			/* or, if current=<start_addr> & last=<end_addr>, swap */
			if ((change_point[i]->addr < change_point[i-1]->addr) ||
				((change_point[i]->addr == change_point[i-1]->addr) &&
				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
			   )
			{
				change_tmp = change_point[i];
				change_point[i] = change_point[i-1];
				change_point[i-1] = change_tmp;
				still_changing = true;
			}
		}
	}

	/* create a new bios memory map, removing overlaps */
	overlap_entries=0;	 /* number of entries in the overlap table */
	new_bios_entry=0;	 /* index for creating new bios map entries */
	last_type = 0;		 /* start with undefined memory type */
	last_addr = 0;		 /* start with 0 as last starting address */
	/* loop through change-points, determining affect on the new bios map */
	for (chgidx=0; chgidx < chg_nr; chgidx++)
	{
		/* keep track of all overlapping bios entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
		{
			/* add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
		}
		else
		{
			/* remove entry from list (order independent, so swap with last) */
			for (i=0; i<overlap_entries; i++)
			{
				if (overlap_list[i] == change_point[chgidx]->pbios)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/* if there are overlapping entries, decide which "type" to use */
		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
		current_type = 0;
		for (i=0; i<overlap_entries; i++)
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		/* continue building up new bios map based on this information */
		if (current_type != last_type) {
			if (last_type != 0) {
				new_bios[new_bios_entry].size =
					change_point[chgidx]->addr - last_addr;
				/* move forward only if the new size was non-zero */
				if (new_bios[new_bios_entry].size != 0)
					if (++new_bios_entry >= ARRAY_SIZE(new_bios))
						break; 	/* no more space left for new bios entries */
			}
			if (current_type != 0) {
				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr=change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}
	new_nr = new_bios_entry;   /* retain count for new bios entries */

	/* copy new bios mapping into original location */
	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
	*pnr_map = new_nr;

	return 0;
}
288
289 /*
290 * Copy the BIOS e820 map into a safe place.
291 *
292 * Sanity-check it while we're at it..
293 *
294 * If we're lucky and live on a modern system, the setup code
295 * will have given us a memory map that we can use to properly
296 * set up memory. If we aren't, we'll fake a memory map.
297 *
298 * We check to see that the memory map contains at least 2 elements
299 * before we'll use it, because the detection code in setup.S may
300 * not be perfect and most every PC known to man has two memory
301 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
302 * thinkpad 560x, for example, does not cooperate with the memory
303 * detection code.)
304 */
copy_e820_map(struct e820entry * biosmap,unsigned int nr_map)305 static int __init copy_e820_map(struct e820entry * biosmap, unsigned int nr_map)
306 {
307 /* Only one memory region? Ignore it */
308 if (nr_map < 2)
309 return -1;
310
311 do {
312 unsigned long long start = biosmap->addr;
313 unsigned long long size = biosmap->size;
314 unsigned long long end = start + size;
315 unsigned long type = biosmap->type;
316
317 /* Overflow in 64 bits? Ignore the memory map. */
318 if (start > end)
319 return -1;
320
321 /*
322 * Some BIOSes claim RAM in the 640k - 1M region.
323 * Not right. Fix it up, but only when running on bare metal.
324 */
325 if (!cpu_has_hypervisor && type == E820_RAM) {
326 if (start < 0x100000ULL && end > 0xA0000ULL) {
327 if (start < 0xA0000ULL)
328 add_memory_region(start, 0xA0000ULL-start, type);
329 if (end <= 0x100000ULL)
330 continue;
331 start = 0x100000ULL;
332 size = end - start;
333 }
334 }
335 add_memory_region(start, size, type);
336 } while (biosmap++,--nr_map);
337 return 0;
338 }
339
340
341 /*
342 * Find the highest page frame number we have available
343 */
find_max_pfn(void)344 static unsigned long __init find_max_pfn(void)
345 {
346 unsigned int i;
347 unsigned long max_pfn = 0;
348
349 for (i = 0; i < e820.nr_map; i++) {
350 unsigned long start, end;
351 /* RAM? */
352 if (e820.map[i].type != E820_RAM)
353 continue;
354 start = PFN_UP(e820.map[i].addr);
355 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
356 if (start >= end)
357 continue;
358 if (end > max_pfn)
359 max_pfn = end;
360 }
361
362 return max_pfn;
363 }
364
clip_to_limit(uint64_t limit,char * warnmsg)365 static void __init clip_to_limit(uint64_t limit, char *warnmsg)
366 {
367 unsigned int i;
368 char _warnmsg[160];
369 uint64_t old_limit = 0;
370
371 for ( ; ; )
372 {
373 /* Find a RAM region needing clipping. */
374 for ( i = 0; i < e820.nr_map; i++ )
375 if ( (e820.map[i].type == E820_RAM) &&
376 ((e820.map[i].addr + e820.map[i].size) > limit) )
377 break;
378
379 /* If none found, we are done. */
380 if ( i == e820.nr_map )
381 break;
382
383 old_limit = max_t(
384 uint64_t, old_limit, e820.map[i].addr + e820.map[i].size);
385
386 /* We try to convert clipped RAM areas to E820_UNUSABLE. */
387 if ( e820_change_range_type(&e820, max(e820.map[i].addr, limit),
388 e820.map[i].addr + e820.map[i].size,
389 E820_RAM, E820_UNUSABLE) )
390 continue;
391
392 /*
393 * If the type change fails (e.g., not space in table) then we clip or
394 * delete the region as appropriate.
395 */
396 if ( e820.map[i].addr < limit )
397 {
398 e820.map[i].size = limit - e820.map[i].addr;
399 }
400 else
401 {
402 memmove(&e820.map[i], &e820.map[i+1],
403 (e820.nr_map - i - 1) * sizeof(struct e820entry));
404 e820.nr_map--;
405 }
406 }
407
408 if ( old_limit )
409 {
410 if ( warnmsg )
411 {
412 snprintf(_warnmsg, sizeof(_warnmsg), warnmsg, (long)(limit>>30));
413 printk("WARNING: %s\n", _warnmsg);
414 }
415 printk("Truncating RAM from %lukB to %lukB\n",
416 (unsigned long)(old_limit >> 10), (unsigned long)(limit >> 10));
417 }
418 }
419
/* Conservative estimate of top-of-RAM by looking for MTRR WB regions. */
/*
 * Returns 0 when MTRR-based clipping is disabled or not applicable;
 * otherwise returns the byte address just past the highest write-back
 * variable-range MTRR.
 */
static uint64_t __init mtrr_top_of_ram(void)
{
    uint32_t eax, ebx, ecx, edx;
    uint64_t mtrr_cap, mtrr_def, addr_mask, base, mask, top;
    unsigned int i;

    /* By default we check only Intel systems. */
    if ( e820_mtrr_clip == -1 )
    {
        char vendor[13];
        /* CPUID leaf 0 returns the vendor string in ebx:edx:ecx order. */
        cpuid(0x00000000, &eax,
              (uint32_t *)&vendor[0],
              (uint32_t *)&vendor[8],
              (uint32_t *)&vendor[4]);
        vendor[12] = '\0';
        e820_mtrr_clip = !strcmp(vendor, "GenuineIntel");
    }

    if ( !e820_mtrr_clip )
        return 0;

    if ( e820_verbose )
        printk("Checking MTRR ranges...\n");

    /* Does the CPU support architectural MTRRs? */
    cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
    if ( !test_bit(X86_FEATURE_MTRR & 31, &edx) )
        return 0;

    /* paddr_bits must have been set at this point */
    ASSERT(paddr_bits);
    addr_mask = ((1ull << paddr_bits) - 1) & PAGE_MASK;

    rdmsrl(MSR_MTRRcap, mtrr_cap);
    rdmsrl(MSR_MTRRdefType, mtrr_def);

    if ( e820_verbose )
        printk(" MTRR cap: %"PRIx64" type: %"PRIx64"\n", mtrr_cap, mtrr_def);

    /*
     * MTRRs enabled, and default memory type is not writeback?
     * (Bit 11 of MTRRdefType is the MTRR enable bit; its low byte is the
     * default type.  If the default is already WB there's nothing to clip.)
     */
    if ( !test_bit(11, &mtrr_def) || ((uint8_t)mtrr_def == MTRR_TYPE_WRBACK) )
        return 0;

    /*
     * Find end of highest WB-type range. This is a conservative estimate
     * of the highest WB address since overlapping UC/WT ranges dominate.
     */
    top = 0;
    /* Low byte of MTRRcap is the variable-range MTRR count (VCNT). */
    for ( i = 0; i < (uint8_t)mtrr_cap; i++ )
    {
        rdmsrl(MSR_IA32_MTRR_PHYSBASE(i), base);
        rdmsrl(MSR_IA32_MTRR_PHYSMASK(i), mask);

        if ( e820_verbose )
            printk(" MTRR[%d]: base %"PRIx64" mask %"PRIx64"\n",
                   i, base, mask);

        /* Skip disabled ranges (mask bit 11 clear) and non-WB types. */
        if ( !test_bit(11, &mask) || ((uint8_t)base != MTRR_TYPE_WRBACK) )
            continue;
        base &= addr_mask;
        mask &= addr_mask;
        /* (base | ~mask) is the last address matched by this range. */
        top = max_t(uint64_t, top, ((base | ~mask) & addr_mask) + PAGE_SIZE);
    }

    return top;
}
487
reserve_dmi_region(void)488 static void __init reserve_dmi_region(void)
489 {
490 for ( ; ; )
491 {
492 paddr_t base;
493 u32 len;
494 const char *what = dmi_get_table(&base, &len);
495
496 if ( !what )
497 break;
498 if ( ((base + len) > base) &&
499 reserve_e820_ram(&e820, base, base + len) )
500 printk("WARNING: %s table located in E820 RAM %"PRIpaddr"-%"PRIpaddr". Fixed.\n",
501 what, base, base + len);
502 }
503 }
504
/*
 * Build the final e820 map from the raw firmware map: sanitise it,
 * apply the "mem"/"availmem" command line limits, clip to the range
 * addressable via the M2P tables, reserve DMI tables, and optionally
 * clip to the MTRR-covered top of RAM.
 */
static void __init machine_specific_memory_setup(struct e820map *raw)
{
    unsigned long mpt_limit, ro_mpt_limit;
    uint64_t top_of_ram, size;
    unsigned int i;

    sanitize_e820_map(raw->map, &raw->nr_map);
    copy_e820_map(raw->map, raw->nr_map);

    /* "mem=" limits the highest physical address directly. */
    if ( opt_mem )
        clip_to_limit(opt_mem, NULL);

    /* "availmem=" limits the total amount of usable RAM instead. */
    if ( opt_availmem )
    {
        /* Accumulate RAM region sizes until the budget is exceeded... */
        for ( i = size = 0; (i < e820.nr_map) && (size <= opt_availmem); i++ )
            if ( e820.map[i].type == E820_RAM )
                size += e820.map[i].size;
        /*
         * ...then clip inside the last counted region (i >= 1 here,
         * since size can only exceed the budget after at least one
         * RAM region was added).
         */
        if ( size > opt_availmem )
            clip_to_limit(
                e820.map[i-1].addr + e820.map[i-1].size - (size-opt_availmem),
                NULL);
    }

    /*
     * RAM beyond what the machine-to-phys translation tables can
     * describe is unusable: clip to the smaller of the RW and RO
     * M2P virtual windows.
     */
    mpt_limit = ((RDWR_MPT_VIRT_END - RDWR_MPT_VIRT_START)
                 / sizeof(unsigned long)) << PAGE_SHIFT;
    ro_mpt_limit = ((RO_MPT_VIRT_END - RO_MPT_VIRT_START)
                    / sizeof(unsigned long)) << PAGE_SHIFT;
    if ( mpt_limit > ro_mpt_limit )
        mpt_limit = ro_mpt_limit;
    clip_to_limit(mpt_limit,
                  "Only the first %lu GB of the physical "
                  "memory map can be accessed by Xen.");

    reserve_dmi_region();

    /* Optionally drop RAM not covered by write-back MTRRs. */
    top_of_ram = mtrr_top_of_ram();
    if ( top_of_ram )
        clip_to_limit(top_of_ram, "MTRRs do not cover all of memory.");
}
544
/* This function relies on the passed in e820->map[] being sorted. */
/*
 * Insert the range [s,e) of @type into @e820, merging with an adjacent
 * entry of the same type where possible.  Returns 1 on success, 0 if
 * the range overlaps an existing entry or the table is full.
 */
int __init e820_add_range(
    struct e820map *e820, uint64_t s, uint64_t e, uint32_t type)
{
    unsigned int i;

    for ( i = 0; i < e820->nr_map; ++i )
    {
        uint64_t rs = e820->map[i].addr;
        uint64_t re = rs + e820->map[i].size;

        /* New range ends exactly where this entry starts: extend it down. */
        if ( rs == e && e820->map[i].type == type )
        {
            e820->map[i].addr = s;
            return 1;
        }

        /*
         * New range starts exactly where this entry ends: extend it up,
         * but only if it won't run into the following entry.
         */
        if ( re == s && e820->map[i].type == type &&
             (i + 1 == e820->nr_map || e820->map[i + 1].addr >= e) )
        {
            e820->map[i].size += e - s;
            return 1;
        }

        /* Map is sorted: the insertion point is before this entry. */
        if ( rs >= e )
            break;

        /* Partial overlap with an existing entry: refuse. */
        if ( re > s )
            return 0;
    }

    if ( e820->nr_map >= ARRAY_SIZE(e820->map) )
    {
        printk(XENLOG_WARNING "E820: overflow while adding region"
               " %"PRIx64"-%"PRIx64"\n", s, e);
        return 0;
    }

    /* Shift later entries up one slot and insert the new one at index i. */
    memmove(e820->map + i + 1, e820->map + i,
            (e820->nr_map - i) * sizeof(*e820->map));

    e820->nr_map++;
    e820->map[i].addr = s;
    e820->map[i].size = e - s;
    e820->map[i].type = type;

    return 1;
}
593
/*
 * Change the type of the sub-range [s,e) within @e820 from @orig_type
 * to @new_type.  The range must lie entirely within a single existing
 * entry of @orig_type; that entry is split as needed, consuming up to
 * two extra table slots.  Adjacent same-type entries are re-merged
 * afterwards.  Returns 1 on success, 0 on failure (range not contained
 * in one entry, wrong type, or table overflow).
 */
int __init e820_change_range_type(
    struct e820map *e820, uint64_t s, uint64_t e,
    uint32_t orig_type, uint32_t new_type)
{
    uint64_t rs = 0, re = 0;
    unsigned int i;

    for ( i = 0; i < e820->nr_map; i++ )
    {
        /* Have we found the e820 region that includes the specified range? */
        rs = e820->map[i].addr;
        re = rs + e820->map[i].size;
        if ( (s >= rs) && (e <= re) )
            break;
    }

    if ( (i == e820->nr_map) || (e820->map[i].type != orig_type) )
        return 0;

    if ( (s == rs) && (e == re) )
    {
        /* Exact match: retype in place, no split needed. */
        e820->map[i].type = new_type;
    }
    else if ( (s == rs) || (e == re) )
    {
        /* Range abuts one end of the entry: split into two pieces. */
        if ( (e820->nr_map + 1) > ARRAY_SIZE(e820->map) )
            goto overflow;

        memmove(&e820->map[i+1], &e820->map[i],
                (e820->nr_map-i) * sizeof(e820->map[0]));
        e820->nr_map++;

        if ( s == rs )
        {
            /* New-type piece first, remainder of the old entry after. */
            e820->map[i].size = e - s;
            e820->map[i].type = new_type;
            e820->map[i+1].addr = e;
            e820->map[i+1].size = re - e;
        }
        else
        {
            /* Old-type head first, new-type piece at the end. */
            e820->map[i].size = s - rs;
            e820->map[i+1].addr = s;
            e820->map[i+1].size = e - s;
            e820->map[i+1].type = new_type;
        }
    }
    else
    {
        /* Range is strictly interior: split into three pieces. */
        if ( (e820->nr_map + 2) > ARRAY_SIZE(e820->map) )
            goto overflow;

        memmove(&e820->map[i+2], &e820->map[i],
                (e820->nr_map-i) * sizeof(e820->map[0]));
        e820->nr_map += 2;

        e820->map[i].size = s - rs;
        e820->map[i+1].addr = s;
        e820->map[i+1].size = e - s;
        e820->map[i+1].type = new_type;
        e820->map[i+2].addr = e;
        e820->map[i+2].size = re - e;
    }

    /* Finally, look for any opportunities to merge adjacent e820 entries. */
    for ( i = 0; i < (e820->nr_map - 1); i++ )
    {
        if ( (e820->map[i].type != e820->map[i+1].type) ||
             ((e820->map[i].addr + e820->map[i].size) != e820->map[i+1].addr) )
            continue;
        e820->map[i].size += e820->map[i+1].size;
        memmove(&e820->map[i+1], &e820->map[i+2],
                (e820->nr_map-i-2) * sizeof(e820->map[0]));
        e820->nr_map--;
        /* Re-examine slot i against its new successor. */
        i--;
    }

    return 1;

 overflow:
    printk("Overflow in e820 while reserving region %"PRIx64"-%"PRIx64"\n",
           s, e);
    return 0;
}
678
/* Set E820_RAM area (@s,@e) as RESERVED in specified e820 map. */
int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
{
    /* Fails (returns 0) unless [s,e) lies within a single RAM entry. */
    return e820_change_range_type(e820, s, e, E820_RAM, E820_RESERVED);
}
684
/*
 * Entry point: turn the raw firmware memory map @raw into the final
 * system map 'e820' (printing both maps when "e820-verbose" is set),
 * and return the highest RAM page frame number.  @str names the map
 * source in the log messages.
 */
unsigned long __init init_e820(const char *str, struct e820map *raw)
{
    if ( e820_verbose )
    {
        printk("Initial %s RAM map:\n", str);
        print_e820_memory_map(raw->map, raw->nr_map);
    }

    /* Sanitise, apply command-line limits, and clip the raw map. */
    machine_specific_memory_setup(raw);

    /* Let the guest layer adjust the map when running virtualised. */
    if ( cpu_has_hypervisor )
        hypervisor_e820_fixup(&e820);

    printk("%s RAM map:\n", str);
    print_e820_memory_map(e820.map, e820.nr_map);

    return find_max_pfn();
}
703