#include <xen/init.h>
#include <xen/lib.h>
#include <xen/err.h>
#include <xen/grant_table.h>
#include <xen/param.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/serial.h>
#include <xen/softirq.h>
#include <xen/acpi.h>
#include <xen/efi.h>
#include <xen/console.h>
#include <xen/serial.h>
#include <xen/trace.h>
#include <xen/multiboot.h>
#include <xen/domain_page.h>
#include <xen/version.h>
#include <xen/gdbstub.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
#include <xen/numa.h>
#include <xen/rcupdate.h>
#include <xen/vga.h>
#include <xen/dmi.h>
#include <xen/pfn.h>
#include <xen/nodemask.h>
#include <xen/virtual_region.h>
#include <xen/watchdog.h>
#include <public/version.h>
#include <compat/platform.h>
#include <compat/xen.h>
#include <xen/bitops.h>
#include <asm/smp.h>
#include <asm/processor.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/msi.h>
#include <asm/desc.h>
#include <asm/paging.h>
#include <asm/e820.h>
#include <xen/kexec.h>
#include <asm/edd.h>
#include <xsm/xsm.h>
#include <asm/tboot.h>
#include <asm/bzimage.h> /* for bzimage_headroom */
#include <asm/mach-generic/mach_apic.h> /* for generic_apic_probe */
#include <asm/setup.h>
#include <xen/cpu.h>
#include <asm/nmi.h>
#include <asm/alternative.h>
#include <asm/mc146818rtc.h>
#include <asm/cpuid.h>
#include <asm/spec_ctrl.h>
#include <asm/guest.h>
#include <asm/microcode.h>
#include <asm/pv/domain.h>

/* opt_nosmp: If true, secondary processors are ignored. */
static bool __initdata opt_nosmp;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int __initdata max_cpus;
integer_param("maxcpus", max_cpus);

int8_t __read_mostly opt_smt = -1;
boolean_param("smt", opt_smt);

/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
static bool __initdata opt_invpcid = true;
boolean_param("invpcid", opt_invpcid);
bool __read_mostly use_invpcid;

unsigned long __read_mostly cr4_pv32_mask;

/* **** Linux config option: propagated to domain0. */
/* "acpi=off": Disables both ACPI table parsing and interpreter. */
/* "acpi=force": Override the disable blacklist. */
/* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
/* "acpi=noirq": Disables ACPI interrupt routing. */
static int parse_acpi_param(const char *s);
custom_param("acpi", parse_acpi_param);

/* **** Linux config option: propagated to domain0. */
/* noapic: Disable IOAPIC setup. */
boolean_param("noapic", skip_ioapic_setup);

/* **** Linux config option: propagated to domain0. */
/* xen_cpuidle: Xen controls C-states. */
s8 __read_mostly xen_cpuidle = -1;
boolean_param("cpuidle", xen_cpuidle);

#ifndef NDEBUG
unsigned long __initdata highmem_start;
size_param("highmem-start", highmem_start);
#endif

#ifdef CONFIG_XEN_SHSTK
static bool __initdata opt_xen_shstk = true;
#else
#define opt_xen_shstk false
#endif

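/*
 * Parse the "cet=" command line option. Currently the only recognised
 * sub-option is "shstk", e.g. "cet=shstk" or "cet=no-shstk".
 */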
static int __init parse_cet(const char *s)
{
    const char *ss;
    int val, rc = 0;

    do {
        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        if ( (val = parse_boolean("shstk", s, ss)) >= 0 )
        {
#ifdef CONFIG_XEN_SHSTK
            opt_xen_shstk = val;
#else
            no_config_param("XEN_SHSTK", "cet", s, ss);
#endif
        }
        else
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("cet", parse_cet);

cpumask_t __read_mostly cpu_present_map;

unsigned long __read_mostly xen_phys_start;

unsigned long __read_mostly xen_virt_end;

char __section(".bss.stack_aligned") __aligned(STACK_SIZE)
    cpu0_stack[STACK_SIZE];

struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };

unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4;

/* smep: Enable/disable Supervisor Mode Execution Protection */
#define SMEP_HVM_ONLY (-2)
static s8 __initdata opt_smep = -1;

/*
 * Initial domain placeholder. Needs to be global so it can be created in
 * __start_xen and unpaused in init_done.
 */
static struct domain *__initdata dom0;

static int __init parse_smep_param(const char *s)
{
    if ( !*s )
    {
        opt_smep = 1;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_smep = 0;
        return 0;
    case 1:
        opt_smep = 1;
        return 0;
    }

    if ( !strcmp(s, "hvm") )
        opt_smep = SMEP_HVM_ONLY;
    else
        return -EINVAL;

    return 0;
}
custom_param("smep", parse_smep_param);

/* smap: Enable/disable Supervisor Mode Access Prevention */
#define SMAP_HVM_ONLY (-2)
static s8 __initdata opt_smap = -1;

static int __init parse_smap_param(const char *s)
{
    if ( !*s )
    {
        opt_smap = 1;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_smap = 0;
        return 0;
    case 1:
        opt_smap = 1;
        return 0;
    }

    if ( !strcmp(s, "hvm") )
        opt_smap = SMAP_HVM_ONLY;
    else
        return -EINVAL;

    return 0;
}
custom_param("smap", parse_smap_param);

bool __read_mostly acpi_disabled;
bool __initdata acpi_force;
static char __initdata acpi_param[10] = "";

static int __init parse_acpi_param(const char *s)
{
    /* Save the parameter so it can be propagated to domain0. */
    safe_strcpy(acpi_param, s);

    /* Interpret the parameter for use within Xen. */
    if ( !parse_bool(s, NULL) )
    {
        disable_acpi();
    }
    else if ( !strcmp(s, "force") )
    {
        acpi_force = true;
        acpi_ht = 1;
        acpi_disabled = false;
    }
    else if ( !strcmp(s, "ht") )
    {
        if ( !acpi_force )
            disable_acpi();
        acpi_ht = 1;
    }
    else if ( !strcmp(s, "noirq") )
    {
        acpi_noirq_set();
    }
    else
        return -EINVAL;

    return 0;
}

static const module_t *__initdata initial_images;
static unsigned int __initdata nr_initial_images;

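/* Count the pages occupied by the boot modules that lie on the given node. */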
unsigned long __init initial_images_nrpages(nodeid_t node)
{
    unsigned long node_start = node_start_pfn(node);
    unsigned long node_end = node_end_pfn(node);
    unsigned long nr;
    unsigned int i;

    for ( nr = i = 0; i < nr_initial_images; ++i )
    {
        unsigned long start = initial_images[i].mod_start;
        unsigned long end = start + PFN_UP(initial_images[i].mod_end);

        if ( end > node_start && node_end > start )
            nr += min(node_end, end) - max(node_start, start);
    }

    return nr;
}

void __init discard_initial_images(void)
{
    unsigned int i;

    for ( i = 0; i < nr_initial_images; ++i )
    {
        uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT;

        init_domheap_pages(start,
                           start + PAGE_ALIGN(initial_images[i].mod_end));
    }

    nr_initial_images = 0;
    initial_images = NULL;
}

extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];

static void __init init_idle_domain(void)
{
    scheduler_init();
    set_current(idle_vcpu[0]);
    this_cpu(curr_vcpu) = current;
}

void srat_detect_node(int cpu)
{
    nodeid_t node;
    u32 apicid = x86_cpu_to_apicid[cpu];

    node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
    if ( node == NUMA_NO_NODE )
        node = 0;

    node_set_online(node);
    numa_set_node(cpu, node);

    if ( opt_cpu_info && acpi_numa > 0 )
        printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
}

/*
 * Sort CPUs by <node,package,core,thread> tuple. Fortunately this hierarchy is
 * reflected in the structure of modern APIC identifiers, so we sort based on
 * those. This is slightly complicated by the fact that the BSP must remain
 * CPU 0. Hence we do a variation on longest-prefix matching to do the best we
 * can while keeping CPU 0 static.
 */
static void __init normalise_cpu_order(void)
{
    unsigned int i, j, min_cpu;
    uint32_t apicid, diff, min_diff;

    for_each_present_cpu ( i )
    {
        apicid = x86_cpu_to_apicid[i];
        min_diff = min_cpu = ~0u;

        /*
         * Find remaining CPU with longest-prefix match on APIC ID.
         * Among identical longest-prefix matches, pick the smallest APIC ID.
         */
        for ( j = cpumask_next(i, &cpu_present_map);
              j < nr_cpu_ids;
              j = cpumask_next(j, &cpu_present_map) )
        {
            diff = x86_cpu_to_apicid[j] ^ apicid;
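            /*
             * Reduce diff to just its highest set bit; a longer shared APIC
             * ID prefix therefore compares as a smaller difference.
             */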
            while ( diff & (diff-1) )
                diff &= diff-1;
            if ( (diff < min_diff) ||
                 ((diff == min_diff) &&
                  (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) )
            {
                min_diff = diff;
                min_cpu = j;
            }
        }

        /* If no match then there must be no CPUs remaining to consider. */
        if ( min_cpu >= nr_cpu_ids )
        {
            BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids);
            break;
        }

        /* Switch the best-matching CPU with the next CPU in logical order. */
        j = cpumask_next(i, &cpu_present_map);
        apicid = x86_cpu_to_apicid[min_cpu];
        x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j];
        x86_cpu_to_apicid[j] = apicid;
    }
}

#define BOOTSTRAP_MAP_BASE  (16UL << 20)
#define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT)

/*
 * Ensure a given physical memory range is present in the bootstrap mappings.
 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
 */
void *__init bootstrap_map(const module_t *mod)
{
    static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE;
    uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1;
    void *ret;

    if ( system_state != SYS_STATE_early_boot )
        return mod ? mfn_to_virt(mod->mod_start) : NULL;

    if ( !mod )
    {
        destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT);
        map_cur = BOOTSTRAP_MAP_BASE;
        return NULL;
    }

    start = (uint64_t)mod->mod_start << PAGE_SHIFT;
    end = start + mod->mod_end;
    if ( start >= end )
        return NULL;

    ret = (void *)(map_cur + (unsigned long)(start & mask));
    start &= ~mask;
    end = (end + mask) & ~mask;
    if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur )
        return NULL;

    map_pages_to_xen(map_cur, maddr_to_mfn(start),
                     PFN_DOWN(end - start), PAGE_HYPERVISOR);
    map_cur += end - start;
    return ret;
}

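/*
 * Move [src, src+size) to [dst, dst+size) in chunks, mapping source and
 * destination through the bootstrap map window. With keep set, the
 * destination mapping of the (single) chunk is retained and a pointer to
 * it is returned.
 */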
static void *__init move_memory(
    uint64_t dst, uint64_t src, unsigned int size, bool keep)
{
    unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE;
    unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1;

    if ( src + size > BOOTSTRAP_MAP_BASE )
        blksz >>= 1;

    while ( size )
    {
        module_t mod;
        unsigned int soffs = src & mask;
        unsigned int doffs = dst & mask;
        unsigned int sz;
        void *d, *s;

        mod.mod_start = (src - soffs) >> PAGE_SHIFT;
        mod.mod_end = soffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        sz = mod.mod_end - soffs;
        s = bootstrap_map(&mod);

        mod.mod_start = (dst - doffs) >> PAGE_SHIFT;
        mod.mod_end = doffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        if ( sz > mod.mod_end - doffs )
            sz = mod.mod_end - doffs;
        d = bootstrap_map(&mod);

        memmove(d + doffs, s + soffs, sz);

        dst += sz;
        src += sz;
        size -= sz;

        if ( keep )
            return size ? NULL : d + doffs;

        bootstrap_map(NULL);
    }

    return NULL;
}

#undef BOOTSTRAP_MAP_LIMIT

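/*
 * Find a size-byte hole in [s, e) which does not overlap any of the given
 * boot modules (module this_mod excepted). Returns the end address of a
 * suitable hole, or 0 if none exists.
 */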
static uint64_t __init consider_modules(
    uint64_t s, uint64_t e, uint32_t size, const module_t *mod,
    unsigned int nr_mods, unsigned int this_mod)
{
    unsigned int i;

    if ( s > e || e - s < size )
        return 0;

    for ( i = 0; i < nr_mods ; ++i )
    {
        uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
        uint64_t end = start + PAGE_ALIGN(mod[i].mod_end);

        if ( i == this_mod )
            continue;

        if ( s < end && start < e )
        {
            end = consider_modules(end, e, size, mod + i + 1,
                                   nr_mods - i - 1, this_mod - i - 1);
            if ( end )
                return end;

            return consider_modules(s, start, size, mod + i + 1,
                                    nr_mods - i - 1, this_mod - i - 1);
        }
    }

    return e;
}

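/*
 * Clamp max_pdx (and hence max_page) so that the directmap, frame table and
 * M2P table can all cover the resulting range.
 */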
static void __init setup_max_pdx(unsigned long top_page)
{
    max_pdx = pfn_to_pdx(top_page - 1) + 1;

    if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
        max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;

    if ( max_pdx > FRAMETABLE_NR )
        max_pdx = FRAMETABLE_NR;

    if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
        max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);

#ifdef PAGE_LIST_NULL
    if ( max_pdx >= PAGE_LIST_NULL )
        max_pdx = PAGE_LIST_NULL - 1;
#endif

    max_page = pdx_to_pfn(max_pdx - 1) + 1;
}

/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

#ifdef CONFIG_VIDEO
struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};
extern struct boot_video_info boot_vid_info;
#endif

static void __init parse_video_info(void)
{
#ifdef CONFIG_VIDEO
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    /* vga_console_info is filled directly on EFI platform. */
    if ( efi_enabled(EFI_BOOT) )
        return;

    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
#endif
}

static void __init kexec_reserve_area(struct e820map *e820)
{
#ifdef CONFIG_KEXEC
    unsigned long kdump_start = kexec_crash_area.start;
    unsigned long kdump_size = kexec_crash_area.size;
    static bool __initdata is_reserved = false;

    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;

    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
        return;

    is_reserved = true;

    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
    {
        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)"
               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
        kexec_crash_area.start = kexec_crash_area.size = 0;
    }
    else
    {
        printk("Kdump: %luMB (%lukB) at %#lx\n",
               kdump_size >> 20, kdump_size >> 10, kdump_start);
    }
#endif
}

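/*
 * True iff all of the Xen image's section boundaries are 2M-aligned, i.e.
 * the image can be mapped in its entirety with superpages.
 */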
static inline bool using_2M_mapping(void)
{
    return !l1_table_offset((unsigned long)__2M_text_end) &&
           !l1_table_offset((unsigned long)__2M_rodata_start) &&
           !l1_table_offset((unsigned long)__2M_rodata_end) &&
           !l1_table_offset((unsigned long)__2M_init_start) &&
           !l1_table_offset((unsigned long)__2M_init_end) &&
           !l1_table_offset((unsigned long)__2M_rwdata_start) &&
           !l1_table_offset((unsigned long)__2M_rwdata_end);
}

static void noinline init_done(void)
{
    void *va;
    unsigned long start, end;

    system_state = SYS_STATE_active;

    domain_unpause_by_systemcontroller(dom0);

    /* MUST be done prior to removing .init data. */
    unregister_init_virtual_region();

    /* Zero the .init code and data. */
    for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE )
        clear_page(va);

    /* Destroy Xen's mappings, and reuse the pages. */
    if ( using_2M_mapping() )
    {
        start = (unsigned long)&__2M_init_start;
        end   = (unsigned long)&__2M_init_end;
    }
    else
    {
        start = (unsigned long)&__init_begin;
        end   = (unsigned long)&__init_end;
    }

    destroy_xen_mappings(start, end);
    init_xenheap_pages(__pa(start), __pa(end));
    printk("Freed %lukB init memory\n", (end - start) >> 10);

    startup_cpu_idle_loop();
}

/* Reinitialise all state referring to the old virtual address of the stack. */
static void __init noreturn reinit_bsp_stack(void)
{
    unsigned long *stack = (void *)(get_stack_bottom() & ~(STACK_SIZE - 1));

    /* Update TSS and ISTs */
    load_system_tables();

    /* Update SYSCALL trampolines */
    percpu_traps_init();

    stack_base[0] = stack;
    memguard_guard_stack(stack);

    if ( IS_ENABLED(CONFIG_XEN_SHSTK) && cpu_has_xen_shstk )
    {
        wrmsrl(MSR_PL0_SSP,
               (unsigned long)stack + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8);
        wrmsrl(MSR_S_CET, CET_SHSTK_EN | CET_WRSS_EN);
        asm volatile ("setssbsy" ::: "memory");
    }

    reset_stack_and_jump(init_done);
}

/*
 * Some scripts add "placeholder" to work around a grub error where it ate the
 * first parameter.
 */
ignore_param("placeholder");

static bool __init loader_is_grub2(const char *loader_name)
{
    /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
    const char *p = strstr(loader_name, "GRUB ");
    return (p != NULL) && (p[5] != '0');
}

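/*
 * Canonicalise the boot command line: skip leading whitespace, and skip the
 * image name that GRUB1 (but not GRUB2 or PVH) prepends as the first item.
 */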
static char * __init cmdline_cook(char *p, const char *loader_name)
{
    p = p ? : "";

    /* Strip leading whitespace. */
    while ( *p == ' ' )
        p++;

    /*
     * GRUB2 and PVH don't include the image name as the first item on the
     * command line.
     */
    if ( xen_guest || loader_is_grub2(loader_name) )
        return p;

    /* Strip image name plus whitespace. */
    while ( (*p != ' ') && (*p != '\0') )
        p++;
    while ( *p == ' ' )
        p++;

    return p;
}

static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int limit)
{
    unsigned int n = min(bootsym(bios_e820nr), limit);

    if ( n )
        memcpy(map, bootsym(bios_e820map), sizeof(*map) * n);

    return n;
}

static struct domain *__init create_dom0(const module_t *image,
                                         unsigned long headroom,
                                         module_t *initrd, const char *kextra,
                                         const char *loader)
{
    struct xen_domctl_createdomain dom0_cfg = {
        .flags = IS_ENABLED(CONFIG_TBOOT) ? XEN_DOMCTL_CDF_s3_integrity : 0,
        .max_evtchn_port = -1,
        .max_grant_frames = -1,
        .max_maptrack_frames = -1,
        .max_vcpus = dom0_max_vcpus(),
    };
    struct domain *d;
    char *cmdline;

    if ( opt_dom0_pvh )
    {
        dom0_cfg.flags |= (XEN_DOMCTL_CDF_hvm |
                           ((hvm_hap_supported() && !opt_dom0_shadow) ?
                            XEN_DOMCTL_CDF_hap : 0));

        dom0_cfg.arch.emulation_flags |=
            XEN_X86_EMU_LAPIC | XEN_X86_EMU_IOAPIC | XEN_X86_EMU_VPCI;
    }

    if ( iommu_enabled )
        dom0_cfg.flags |= XEN_DOMCTL_CDF_iommu;

    /* Create initial domain 0. */
    d = domain_create(get_initial_domain_id(), &dom0_cfg, !pv_shim);
    if ( IS_ERR(d) || (alloc_dom0_vcpu0(d) == NULL) )
        panic("Error creating domain 0\n");

    /* Grab the DOM0 command line. */
    cmdline = image->string ? __va(image->string) : NULL;
    if ( cmdline || kextra )
    {
        static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE];

        cmdline = cmdline_cook(cmdline, loader);
        safe_strcpy(dom0_cmdline, cmdline);

        if ( kextra )
            /* kextra always includes exactly one leading space. */
            safe_strcat(dom0_cmdline, kextra);

        /* Append any extra parameters. */
        if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
            safe_strcat(dom0_cmdline, " noapic");
        if ( (strlen(acpi_param) == 0) && acpi_disabled )
        {
            printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
            safe_strcpy(acpi_param, "off");
        }
        if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
        {
            safe_strcat(dom0_cmdline, " acpi=");
            safe_strcat(dom0_cmdline, acpi_param);
        }

        cmdline = dom0_cmdline;
    }

    /*
     * Temporarily clear SMAP in CR4 to allow user-accesses in
     * construct_dom0(). This avoids a large number of corner-case
     * interactions with copy_from_user().
     */
    if ( cpu_has_smap )
    {
        cr4_pv32_mask &= ~X86_CR4_SMAP;
        write_cr4(read_cr4() & ~X86_CR4_SMAP);
    }

    if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 )
        panic("Could not construct domain 0\n");

    if ( cpu_has_smap )
    {
        write_cr4(read_cr4() | X86_CR4_SMAP);
        cr4_pv32_mask |= X86_CR4_SMAP;
    }

    return d;
}

/* How much of the directmap is prebuilt at compile time. */
#define PREBUILT_MAP_LIMIT (1 << L2_PAGETABLE_SHIFT)

void __init noreturn __start_xen(unsigned long mbi_p)
{
    char *memmap_type = NULL;
    char *cmdline, *kextra, *loader;
    unsigned int initrdidx, num_parked = 0;
    multiboot_info_t *mbi;
    module_t *mod;
    unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
    int i, j, e820_warn = 0, bytes = 0;
    bool acpi_boot_table_init_done = false, relocated = false;
    int ret;
    struct ns16550_defaults ns16550 = {
        .data_bits = 8,
        .parity    = 'n',
        .stop_bits = 1
    };
    const char *hypervisor_name;

    /* Critical region without IDT or TSS. Any fault is deadly! */

    init_shadow_spec_ctrl_state();

    percpu_init_areas();

    init_idt_traps();
    load_system_tables();

    smp_prepare_boot_cpu();
    sort_exception_tables();

    setup_virtual_regions(__start___ex_table, __stop___ex_table);

    /* Full exception support from here on in. */

    /* Enable NMIs. Our loader (e.g. Tboot) may have left them disabled. */
    enable_nmis();

    if ( pvh_boot )
    {
        ASSERT(mbi_p == 0);
        pvh_init(&mbi, &mod);
    }
    else
    {
        mbi = __va(mbi_p);
        mod = __va(mbi->mods_addr);
    }

    loader = (mbi->flags & MBI_LOADERNAME)
        ? (char *)__va(mbi->boot_loader_name) : "unknown";

    /* Parse the command-line options. */
    cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
                           __va(mbi->cmdline) : NULL,
                           loader);
    if ( (kextra = strstr(cmdline, " -- ")) != NULL )
    {
        /*
         * Options after ' -- ' separator belong to dom0.
         *  1. Orphan dom0's options from Xen's command line.
         *  2. Skip all but final leading space from dom0's options.
         */
        *kextra = '\0';
        kextra += 3;
        while ( kextra[1] == ' ' ) kextra++;
    }
    cmdline_parse(cmdline);

    /* Must be after command line argument parsing and before
     * allocating any xenheap structures wanted in lower memory. */
    kexec_early_calculations();

    /*
     * The probing has to be done _before_ initialising console,
     * otherwise we couldn't set up Xen's PV console correctly.
     */
    hypervisor_name = hypervisor_probe();

    parse_video_info();

    rdmsrl(MSR_EFER, this_cpu(efer));
    asm volatile ( "mov %%cr4,%0" : "=r" (get_cpu_info()->cr4) );

    /* We initialise the serial devices very early so we can get debugging. */
    ns16550.io_base = 0x3f8;
    ns16550.irq     = 4;
    ns16550_init(0, &ns16550);
    ns16550.io_base = 0x2f8;
    ns16550.irq     = 3;
    ns16550_init(1, &ns16550);
    ehci_dbgp_init();
    console_init_preirq();

    if ( pvh_boot )
        pvh_print_info();

    printk("Bootloader: %s\n", loader);

    printk("Command line: %s\n", cmdline);

    printk("Xen image load base address: %#lx\n", xen_phys_start);
    if ( hypervisor_name )
        printk("Running on %s\n", hypervisor_name);

#ifdef CONFIG_VIDEO
    printk("Video information:\n");

    /* Print VGA display mode information. */
    switch ( vga_console_info.video_type )
    {
    case XEN_VGATYPE_TEXT_MODE_3:
        printk(" VGA is text mode %dx%d, font 8x%d\n",
               vga_console_info.u.text_mode_3.columns,
               vga_console_info.u.text_mode_3.rows,
               vga_console_info.u.text_mode_3.font_height);
        break;
    case XEN_VGATYPE_VESA_LFB:
    case XEN_VGATYPE_EFI_LFB:
        printk(" VGA is graphics mode %dx%d, %d bpp\n",
               vga_console_info.u.vesa_lfb.width,
               vga_console_info.u.vesa_lfb.height,
               vga_console_info.u.vesa_lfb.bits_per_pixel);
        break;
    default:
        printk(" No VGA detected\n");
        break;
    }

    /* Print VBE/DDC EDID information. */
    if ( bootsym(boot_edid_caps) != 0x1313 )
    {
        u16 caps = bootsym(boot_edid_caps);
        printk(" VBE/DDC methods:%s%s%s; ",
               (caps & 1) ? " V1" : "",
               (caps & 2) ? " V2" : "",
               !(caps & 3) ? " none" : "");
        printk("EDID transfer time: %d seconds\n", caps >> 8);
        if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
        {
            printk(" EDID info not retrieved because ");
            if ( !(caps & 3) )
                printk("no DDC retrieval method detected\n");
            else if ( (caps >> 8) > 5 )
                printk("takes longer than 5 seconds\n");
            else
                printk("of reasons unknown\n");
        }
    }
#endif

    printk("Disc information:\n");
    printk(" Found %d MBR signatures\n",
           bootsym(boot_mbr_signature_nr));
    printk(" Found %d EDD information structures\n",
           bootsym(boot_edd_info_nr));

    /* Check that we have at least one Multiboot module. */
    if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
        panic("dom0 kernel not specified. Check bootloader configuration\n");

    /* Check that we don't have a silly number of modules. */
    if ( mbi->mods_count > sizeof(module_map) * 8 )
    {
        mbi->mods_count = sizeof(module_map) * 8;
        printk("Excessive multiboot modules - using the first %u only\n",
               mbi->mods_count);
    }

    bitmap_fill(module_map, mbi->mods_count);
    __clear_bit(0, module_map); /* Dom0 kernel is always first */
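    /*
     * Set bits in module_map denote modules not yet claimed by a consumer
     * (e.g. microcode, XSM policy); whatever remains is an initrd candidate.
     */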

    if ( pvh_boot )
    {
        /* pvh_init() already filled in e820_raw */
        memmap_type = "PVH-e820";
    }
    else if ( efi_enabled(EFI_LOADER) )
    {
        set_pdx_range(xen_phys_start >> PAGE_SHIFT,
                      (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT);

        /* Clean up boot loader identity mappings. */
        destroy_xen_mappings(xen_phys_start,
                             xen_phys_start + BOOTSTRAP_MAP_BASE);

        /* Make boot page tables match non-EFI boot. */
        l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] =
            l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR);

        memmap_type = loader;
    }
    else if ( efi_enabled(EFI_BOOT) )
        memmap_type = "EFI";
    else if ( (e820_raw.nr_map =
                   copy_bios_e820(e820_raw.map,
                                  ARRAY_SIZE(e820_raw.map))) != 0 )
    {
        memmap_type = "Xen-e820";
    }
    else if ( mbi->flags & MBI_MEMMAP )
    {
        memmap_type = "Multiboot-e820";
        while ( bytes < mbi->mmap_length &&
                e820_raw.nr_map < ARRAY_SIZE(e820_raw.map) )
        {
            memory_map_t *map = __va(mbi->mmap_addr + bytes);

            /*
             * This is a gross workaround for a BIOS bug. Some bootloaders do
             * not write e820 map entries into pre-zeroed memory. This is
             * okay if the BIOS fills in all fields of the map entry, but
             * some broken BIOSes do not bother to write the high word of
             * the length field if the length is smaller than 4GB. We
             * detect and fix this by flagging sections below 4GB that
             * appear to be larger than 4GB in size.
             */
            if ( (map->base_addr_high == 0) && (map->length_high != 0) )
            {
                if ( !e820_warn )
                {
                    printk("WARNING: Buggy e820 map detected and fixed "
                           "(truncated length fields).\n");
                    e820_warn = 1;
                }
                map->length_high = 0;
            }

            e820_raw.map[e820_raw.nr_map].addr =
                ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
            e820_raw.map[e820_raw.nr_map].size =
                ((u64)map->length_high << 32) | (u64)map->length_low;
            e820_raw.map[e820_raw.nr_map].type = map->type;
            e820_raw.nr_map++;

            bytes += map->size + 4;
        }
    }
    else if ( bootsym(lowmem_kb) )
    {
        memmap_type = "Xen-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = bootsym(lowmem_kb) << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = bootsym(highmem_kb) << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else if ( mbi->flags & MBI_MEMLIMITS )
    {
        memmap_type = "Multiboot-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = mbi->mem_lower << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = mbi->mem_upper << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else
        panic("Bootloader provided no memory information\n");

    /* This must come before e820 code because it sets paddr_bits. */
    early_cpu_init();

    /* Choose shadow stack early, to set infrastructure up appropriately. */
    if ( opt_xen_shstk && boot_cpu_has(X86_FEATURE_CET_SS) )
    {
        printk("Enabling Supervisor Shadow Stacks\n");

        setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK);
#ifdef CONFIG_PV32
        if ( opt_pv32 )
        {
            opt_pv32 = 0;
            printk(" - Disabling PV32 due to Shadow Stacks\n");
        }
#endif
    }

    /* Sanitise the raw E820 map to produce a final clean version. */
    max_page = raw_max_page = init_e820(memmap_type, &e820_raw);

    if ( !efi_enabled(EFI_BOOT) && e820_raw.nr_map >= 1 )
    {
        /*
         * Supplement the heuristics in l1tf_calculations() by assuming that
         * anything referenced in the E820 may be cacheable.
         */
        l1tf_safe_maddr =
            max(l1tf_safe_maddr,
                ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
                        e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
    }

    /* Create a temporary copy of the E820 map. */
    memcpy(&boot_e820, &e820, sizeof(e820));

    /* Early kexec reservation (explicit static start address). */
    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
    kexec_reserve_area(&boot_e820);

    initial_images = mod;
    nr_initial_images = mbi->mods_count;

    for ( i = 0; !efi_enabled(EFI_LOADER) && i < mbi->mods_count; i++ )
    {
        if ( mod[i].mod_start & (PAGE_SIZE - 1) )
            panic("Bootloader didn't honor module alignment request\n");
        mod[i].mod_end -= mod[i].mod_start;
        mod[i].mod_start >>= PAGE_SHIFT;
        mod[i].reserved = 0;
    }

    if ( xen_phys_start )
    {
        relocated = true;

        /*
         * This needs to remain in sync with xen_in_range() and the
         * respective reserve_e820_ram() invocation below.
         */
        mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
        mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
    }

    modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end);
    bootstrap_map(NULL);

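    /*
     * NB: in release (NDEBUG) builds highmem_start is expected to be
     * #defined to 0 elsewhere, so the adjustment below compiles out.
     */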
#ifndef highmem_start
    /* Don't allow split below 4Gb. */
    if ( highmem_start < GB(4) )
        highmem_start = 0;
    else /* align to L3 entry boundary */
        highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
#endif

    /*
     * Iterate backwards over all superpage-aligned RAM regions.
     *
     * We require superpage alignment because the boot allocator is
     * not yet initialised. Hence we can only map superpages in the
     * address range PREBUILT_MAP_LIMIT to 4GB, as this is guaranteed
     * not to require dynamic allocation of pagetables.
     *
     * As well as mapping superpages in that range, in preparation for
     * initialising the boot allocator, we also look for a region to which
     * we can relocate the dom0 kernel and other multiboot modules. Also, on
     * x86/64, we relocate Xen to higher memory.
     */
    for ( i = boot_e820.nr_map - 1; i >= 0; i-- )
    {
        uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
        uint64_t end, limit = ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT;

        if ( boot_e820.map[i].type != E820_RAM )
            continue;

        /* Superpage-aligned chunks from PREBUILT_MAP_LIMIT. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
        if ( s >= e )
            continue;

        if ( s < limit )
        {
            end = min(e, limit);
            set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT);
            map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
                             PFN_DOWN(end - s), PAGE_HYPERVISOR);
        }

        if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                     1UL << (PAGE_SHIFT + 32)) )
            e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                    1UL << (PAGE_SHIFT + 32));
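        /* Size of the Xen image, rounded up to a 2M boundary. */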
#define reloc_size ((__pa(__2M_rwdata_end) + mask) & ~mask)
        /* Is the region suitable for relocating Xen? */
        if ( !xen_phys_start && e <= limit )
        {
            /* Don't overlap with modules. */
            end = consider_modules(s, e, reloc_size + mask,
                                   mod, mbi->mods_count, -1);
            end &= ~mask;
        }
        else
            end = 0;

        /*
         * Is the region size greater than zero and does it begin
         * at or above the end of current Xen image placement?
         */
        if ( (end > s) && (end - reloc_size + XEN_IMG_OFFSET >= __pa(_end)) )
        {
            l4_pgentry_t *pl4e;
            l3_pgentry_t *pl3e;
            l2_pgentry_t *pl2e;
            int i, j, k;
            unsigned long pte_update_limit;

            /* Select relocation address. */
            xen_phys_start = end - reloc_size;
            e = xen_phys_start + XEN_IMG_OFFSET;
            bootsym(trampoline_xen_phys_start) = xen_phys_start;

            /*
             * No PTEs pointing above this address are candidates for
             * relocation. Due to possibility of partial overlap of the end
             * of source image and the beginning of region for destination
             * image some PTEs may point to addresses in range
             * [e, e + XEN_IMG_OFFSET).
             */
            pte_update_limit = PFN_DOWN(e);

            /*
             * Perform relocation to new physical address.
             * Before doing so we must sync static/global data with main
             * memory with a barrier(). After this we must *not* modify
             * static/global data until after we have switched to the
             * relocated pagetables!
             */
            barrier();
            move_memory(e, XEN_IMG_OFFSET, _end - _start, 1);

            /* Walk initial pagetables, relocating page directory entries. */
            pl4e = __va(__pa(idle_pg_table));
            for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
            {
                if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
                    continue;
                *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
                                        xen_phys_start);
                pl3e = l4e_to_l3e(*pl4e);
                for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
                {
                    /* Not present, 1GB mapping, or already relocated? */
                    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
                         (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
                         (l3e_get_pfn(*pl3e) >= pte_update_limit) )
                        continue;
                    *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
                                            xen_phys_start);
                    pl2e = l3e_to_l2e(*pl3e);
                    for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
                    {
                        /* Not present, PSE, or already relocated? */
                        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                             (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
                             (l2e_get_pfn(*pl2e) >= pte_update_limit) )
                            continue;
                        *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                                xen_phys_start);
                    }
                }
            }

            /* The only data mappings to be relocated are in the Xen area. */
            pl2e = __va(__pa(l2_xenmap));
            /*
             * Undo the temporary-hooking of the l1_directmap.
             * __2M_text_start is contained in this PTE.
             */
            BUG_ON(using_2M_mapping() &&
                   l2_table_offset((unsigned long)_erodata) ==
                   l2_table_offset((unsigned long)_stext));
            *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
            for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
            {
                unsigned int flags;

                if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                     (l2e_get_pfn(*pl2e) >= pte_update_limit) )
                    continue;

                if ( !using_2M_mapping() )
                {
                    *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                            xen_phys_start);
                    continue;
                }

                if ( i < l2_table_offset((unsigned long)&__2M_text_end) )
                {
                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
                {
                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_init_end) )
                {
                    flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
                }
                else if ( (i >= l2_table_offset((unsigned long)&__2M_rwdata_start) &&
                           i <  l2_table_offset((unsigned long)&__2M_rwdata_end)) )
                {
                    flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
                }
                else
                {
                    *pl2e = l2e_empty();
                    continue;
                }

                *pl2e = l2e_from_paddr(
                    l2e_get_paddr(*pl2e) + xen_phys_start, flags);
            }

            /* Re-sync the stack and then switch to relocated pagetables. */
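            /*
             * Note that toggling CR4.PGE around the CR3 write below flushes
             * all TLB entries, including global ones which may still refer
             * to pre-relocation physical addresses.
             */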
            asm volatile (
                "rep movsq        ; " /* re-sync the stack */
                "movq %%cr4,%%rsi ; "
                "andb $0x7f,%%sil ; "
                "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
                "movq %[pg],%%cr3 ; " /* CR3 == new pagetables */
                "orb $0x80,%%sil  ; "
                "movq %%rsi,%%cr4 "   /* CR4.PGE == 1 */
                : "=&S" (i), "=&D" (i), "=&c" (i) /* All outputs discarded. */
                : [pg] "r" (__pa(idle_pg_table)), "0" (cpu0_stack),
                  "1" (__va(__pa(cpu0_stack))), "2" (STACK_SIZE / 8)
                : "memory" );

            bootstrap_map(NULL);

            printk("New Xen image base address: %#lx\n", xen_phys_start);
        }

        /* Is the region suitable for relocating the multiboot modules? */
        for ( j = mbi->mods_count - 1; j >= 0; j-- )
        {
            unsigned long headroom = j ? 0 : modules_headroom;
            unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end);

            if ( mod[j].reserved )
                continue;

            /* Don't overlap with other modules (or Xen itself). */
            end = consider_modules(s, e, size, mod,
                                   mbi->mods_count + relocated, j);

            if ( highmem_start && end > highmem_start )
                continue;

            if ( s < end &&
                 (headroom ||
                  ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) )
            {
                move_memory(end - size + headroom,
                            (uint64_t)mod[j].mod_start << PAGE_SHIFT,
                            mod[j].mod_end, 0);
                mod[j].mod_start = (end - size) >> PAGE_SHIFT;
                mod[j].mod_end += headroom;
                mod[j].reserved = 1;
            }
        }

#ifdef CONFIG_KEXEC
        /*
         * Looking backwards from the crash area limit, find a large
         * enough range that does not overlap with modules.
         */
        while ( !kexec_crash_area.start )
        {
            /* Don't overlap with modules (or Xen itself). */
            e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod,
                                 mbi->mods_count + relocated, -1);
            if ( s >= e )
                break;
            if ( e > kexec_crash_area_limit )
            {
                e = kexec_crash_area_limit & PAGE_MASK;
                continue;
            }
            kexec_crash_area.start = (e - kexec_crash_area.size) & PAGE_MASK;
        }
#endif
    }

    if ( modules_headroom && !mod->reserved )
        panic("Not enough memory to relocate the dom0 kernel image\n");
    for ( i = 0; i < mbi->mods_count; ++i )
    {
        uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT;

        reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end));
    }

    if ( !xen_phys_start )
        panic("Not enough memory to relocate Xen\n");

    /* This needs to remain in sync with xen_in_range(). */
    reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));

    /* Late kexec reservation (dynamic start address). */
    kexec_reserve_area(&boot_e820);

    setup_max_pdx(raw_max_page);
    if ( highmem_start )
        xenheap_max_mfn(PFN_DOWN(highmem_start - 1));

    /*
     * Walk every RAM region and map it in its entirety (on x86/64, at least)
     * and notify it to the boot allocator.
     */
    for ( i = 0; i < boot_e820.nr_map; i++ )
    {
        uint64_t s, e, mask = PAGE_SIZE - 1;
        uint64_t map_s, map_e;

        if ( boot_e820.map[i].type != E820_RAM )
            continue;

        /* Only page alignment required now. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, 1<<20);
        if ( s >= e )
            continue;

        if ( !acpi_boot_table_init_done &&
             s >= (1ULL << 32) &&
             !acpi_boot_table_init() )
        {
            acpi_boot_table_init_done = true;
            srat_parse_regions(s);
            setup_max_pdx(raw_max_page);
        }

        if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
        {
            if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx )
            {
                for ( j = i - 1; ; --j )
                {
                    if ( boot_e820.map[j].type == E820_RAM )
                        break;
                    ASSERT(j);
                }
                map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
                for ( j = 0; j < mbi->mods_count; ++j )
                {
                    uint64_t end = pfn_to_paddr(mod[j].mod_start) +
                                   mod[j].mod_end;

                    if ( map_e < end )
                        map_e = end;
                }
                if ( PFN_UP(map_e) < max_page )
                {
                    max_page = PFN_UP(map_e);
                    max_pdx = pfn_to_pdx(max_page - 1) + 1;
                }
                printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                       " %013"PRIx64"-%013"PRIx64"\n",
                       s, e);
                continue;
            }
            map_e = e;
            e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT;
            printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                   " %013"PRIx64"-%013"PRIx64"\n",
                   e, map_e);
        }

        set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);

        /* Need to create mappings above PREBUILT_MAP_LIMIT. */
        map_s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
        map_e = min_t(uint64_t, e,
                      ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT);

        /* Pass mapped memory to allocator /before/ creating new mappings. */
        init_boot_pages(s, min(map_s, e));
        s = map_s;
        if ( s < map_e )
        {
            uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1;

            map_s = (s + mask) & ~mask;
            map_e &= ~mask;
            init_boot_pages(map_s, map_e);
        }

        if ( map_s > map_e )
            map_s = map_e = s;

        /* Create new mappings /before/ passing memory to the allocator. */
        if ( map_e < e )
        {
            uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1;
            uint64_t end = min(e, limit);

            if ( map_e < end )
            {
                map_pages_to_xen((unsigned long)__va(map_e),
                                 maddr_to_mfn(map_e),
                                 PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
                init_boot_pages(map_e, end);
                map_e = end;
            }
        }
        if ( map_e < e )
        {
            /* This range must not be passed to the boot allocator and
             * must also not be mapped with _PAGE_GLOBAL. */
            map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
        }
        if ( s < map_s )
        {
            map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
                             PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
            init_boot_pages(s, map_s);
        }
    }

    for ( i = 0; i < mbi->mods_count; ++i )
    {
        set_pdx_range(mod[i].mod_start,
                      mod[i].mod_start + PFN_UP(mod[i].mod_end));
        map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
                         _mfn(mod[i].mod_start),
                         PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
    }

#ifdef CONFIG_KEXEC
    if ( kexec_crash_area.size )
    {
        unsigned long s = PFN_DOWN(kexec_crash_area.start);
        unsigned long e = min(s + PFN_UP(kexec_crash_area.size),
                              PFN_UP(__pa(HYPERVISOR_VIRT_END - 1)));

        if ( e > s )
            map_pages_to_xen((unsigned long)__va(kexec_crash_area.start),
                             _mfn(s), e - s, PAGE_HYPERVISOR);
    }
#endif

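    /*
     * Round the end of the Xen image up to a 2M boundary, and drop any
     * remaining boot-time mappings above it.
     */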
    xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) &
                   ~((1UL << L2_PAGETABLE_SHIFT) - 1);
    destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE);

    /*
     * If not using 2M mappings to gain suitable pagetable permissions
     * directly from the relocation above, remap the code/data
     * sections with decreased permissions.
     */
    if ( !using_2M_mapping() )
    {
        /* Mark .text as RX (avoiding the first 2M superpage). */
        modify_xen_mappings(XEN_VIRT_START + MB(2),
                            (unsigned long)&__2M_text_end,
                            PAGE_HYPERVISOR_RX);

        /* Mark .rodata as RO. */
        modify_xen_mappings((unsigned long)&__2M_rodata_start,
                            (unsigned long)&__2M_rodata_end,
                            PAGE_HYPERVISOR_RO);

        /* Mark .data and .bss as RW. */
        modify_xen_mappings((unsigned long)&__2M_rwdata_start,
                            (unsigned long)&__2M_rwdata_end,
                            PAGE_HYPERVISOR_RW);

        /* Drop the remaining mappings in the shattered superpage. */
        destroy_xen_mappings((unsigned long)&__2M_rwdata_end,
                             ROUNDUP((unsigned long)&__2M_rwdata_end, MB(2)));
    }

    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    printk("System RAM: %luMB (%lukB)\n",
           nr_pages >> (20 - PAGE_SHIFT),
           nr_pages << (PAGE_SHIFT - 10));
    total_pages = nr_pages;

    /* Sanity check for unwanted bloat of certain hypercall structures. */
    BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
                 sizeof(((struct xen_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
                 sizeof(((struct xen_domctl *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
                 sizeof(((struct xen_sysctl *)0)->u.pad));

    BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);

    BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
                 sizeof(((struct compat_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);

    /* Check definitions in public headers match internal defs. */
    BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
    BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
    BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
    BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END);

    init_frametable();

    if ( !acpi_boot_table_init_done )
        acpi_boot_table_init();

    acpi_numa_init();

    numa_initmem_init(0, raw_max_page);

    if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
    {
        unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
        uint64_t mask = PAGE_SIZE - 1;

        if ( !highmem_start )
            xenheap_max_mfn(limit);

        end_boot_allocator();

        /* Pass the remaining memory to the allocator. */
        for ( i = 0; i < boot_e820.nr_map; i++ )
        {
            uint64_t s, e;

            if ( boot_e820.map[i].type != E820_RAM )
                continue;
            s = (boot_e820.map[i].addr + mask) & ~mask;
            e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
            if ( PFN_DOWN(e) <= limit )
                continue;
            if ( PFN_DOWN(s) <= limit )
                s = pfn_to_paddr(limit + 1);
            init_domheap_pages(s, e);
        }
    }
    else
        end_boot_allocator();

    system_state = SYS_STATE_boot;
    /*
     * No calls involving ACPI code should go between the setting of
     * SYS_STATE_boot and vm_init() (or else acpi_os_{,un}map_memory()
     * will break).
     */
    vm_init();

    console_init_ring();
    vesa_init();

    tasklet_subsys_init();

    paging_init();

    tboot_probe();

    open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);

    if ( opt_watchdog )
        nmi_watchdog = NMI_LOCAL_APIC;

    find_smp_config();

    dmi_scan_machine();

    generic_apic_probe();

    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
                                  RANGESETF_prettyprint_hex);

    xsm_multiboot_init(module_map, mbi);

    setup_system_domains();

    acpi_boot_init();

    if ( smp_found_config )
        get_smp_config();

    /*
     * In the shim case, the number of CPUs should be solely controlled by the
     * guest configuration file.
     */
    if ( pv_shim )
    {
        opt_nosmp = false;
        max_cpus = 0;
    }
    if ( opt_nosmp )
    {
        max_cpus = 0;
        set_nr_cpu_ids(1);
    }
    else
    {
        set_nr_cpu_ids(max_cpus);
        if ( !max_cpus )
            max_cpus = nr_cpu_ids;
    }

    if ( hypervisor_name )
        hypervisor_setup();

    /* Low mappings were only needed for some BIOS table parsing. */
    zap_low_mappings();

    init_apic_mappings();

    normalise_cpu_order();

    init_cpu_to_node();

    x2apic_bsp_setup();

    ret = init_irq_data();
    if ( ret < 0 )
        panic("Error %d setting up IRQ data\n", ret);

    console_init_irq();

    init_IRQ();

    microcode_grab_module(module_map, mbi);

    timer_init();

    early_microcode_init();

    tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */

    identify_cpu(&boot_cpu_data);

    set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);

    /* Do not enable SMEP/SMAP in PV shim on AMD and Hygon by default */
    if ( opt_smep == -1 )
        opt_smep = !pv_shim || !(boot_cpu_data.x86_vendor &
                                 (X86_VENDOR_AMD | X86_VENDOR_HYGON));
    if ( opt_smap == -1 )
        opt_smap = !pv_shim || !(boot_cpu_data.x86_vendor &
                                 (X86_VENDOR_AMD | X86_VENDOR_HYGON));

    if ( !opt_smep )
        setup_clear_cpu_cap(X86_FEATURE_SMEP);
    if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
        set_in_cr4(X86_CR4_SMEP);

    if ( !opt_smap )
        setup_clear_cpu_cap(X86_FEATURE_SMAP);
    if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
        set_in_cr4(X86_CR4_SMAP);

    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;

    if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
        set_in_cr4(X86_CR4_FSGSBASE);

    if ( opt_invpcid && cpu_has_invpcid )
        use_invpcid = true;

    init_speculation_mitigations();

    init_idle_domain();

    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
                                           &this_cpu(stubs).mfn);
    BUG_ON(!this_cpu(stubs.addr));

    trap_init();

    rcu_init();

    early_time_init();

    arch_init_memory();

    alternative_instructions();

    local_irq_enable();

    vesa_mtrr_init();

    early_msi_init();

    iommu_setup(); /* setup iommu if available */

    smp_prepare_cpus();

    spin_debug_enable();

    /*
     * Initialise higher-level timer functions. We do this fairly late
     * (after interrupts got enabled) because the time bases and scale
     * factors need to be updated regularly.
     */
    init_xen_time();

    initialize_keytable();

    console_init_postirq();

    system_state = SYS_STATE_smp_boot;

    do_presmp_initcalls();

    alternative_branches();

    /* Defer CR4.CET until alternatives have finished playing with CR0.WP */
    if ( cpu_has_xen_shstk )
        set_in_cr4(X86_CR4_CET);

    /*
     * NB: when running as a PV shim VCPUOP_up/down is wired to the shim
     * physical cpu_add/remove functions, so launch the guest with only
     * the BSP online and let it bring up the other CPUs as required.
     */
    if ( !pv_shim )
    {
        for_each_present_cpu ( i )
        {
            /* Set up cpu_to_node[]. */
            srat_detect_node(i);
            /* Set up node_to_cpumask based on cpu_to_node[]. */
            numa_add_cpu(i);

            if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
                 !cpu_online(i) )
            {
                ret = cpu_up(i);
                if ( ret != 0 )
                    printk("Failed to bring up CPU %u (error %d)\n", i, ret);
                else if ( num_online_cpus() > max_cpus ||
                          (!opt_smt &&
                           cpu_data[i].compute_unit_id == INVALID_CUID &&
                           cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
                {
                    ret = cpu_down(i);
                    if ( !ret )
                        ++num_parked;
                    else
                        printk("Could not re-offline CPU%u (%d)\n", i, ret);
                }
            }
        }
    }

    printk("Brought up %ld CPUs\n", (long)num_online_cpus());
    if ( num_parked )
        printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
    smp_cpus_done();

    do_initcalls();

    if ( opt_watchdog )
        watchdog_setup();

    if ( !tboot_protect_mem_regions() )
        panic("Could not protect TXT memory regions\n");

    init_guest_cpuid();
    init_guest_msr_policy();

    if ( xen_cpuidle )
        xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;

    printk("%sNX (Execute Disable) protection %sactive\n",
           cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
           cpu_has_nx ? "" : "not ");

    initrdidx = find_first_bit(module_map, mbi->mods_count);
    if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
        printk(XENLOG_WARNING
               "Multiple initrd candidates, picking module #%u\n",
               initrdidx);

    /*
     * We're going to set up domain0 using the module(s) that we stashed
     * safely above our heap. The second module, if present, is an initrd
     * ramdisk.
     */
    dom0 = create_dom0(mod, modules_headroom,
                       initrdidx < mbi->mods_count ? mod + initrdidx : NULL,
                       kextra, loader);
    if ( !dom0 )
        panic("Could not set up DOM0 guest OS\n");

    heap_init_late();

    init_trace_bufs();

    init_constructors();

    console_endboot();

    /* Hide UART from DOM0 if we're using it */
    serial_endboot();

    dmi_end_boot();

    setup_io_bitmap(dom0);

    if ( bsp_delay_spec_ctrl )
    {
        get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow;
        barrier();
        wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
    }

    /* Jump to the 1:1 virtual mappings of cpu0_stack. */
    asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
                  [stk] "g" (__va(__pa(get_stack_bottom()))),
                  [fn] "i" (reinit_bsp_stack) : "memory");
    unreachable();
}

void arch_get_xen_caps(xen_capabilities_info_t *info)
{
    /* Interface name is always xen-3.0-* for Xen-3.x. */
    int major = 3, minor = 0;
    char s[32];

    (*info)[0] = '\0';

    if ( IS_ENABLED(CONFIG_PV) )
    {
        snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, s);

        if ( opt_pv32 )
        {
            snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
            safe_strcat(*info, s);
        }
    }
    if ( hvm_enabled )
    {
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, s);
    }
}

int __hwdom_init xen_in_range(unsigned long mfn)
{
    paddr_t start, end;
    int i;

    enum { region_s3, region_ro, region_rw, nr_regions };
    static struct {
        paddr_t s, e;
    } xen_regions[nr_regions] __hwdom_initdata;

    /* initialize first time */
    if ( !xen_regions[0].s )
    {
        /* S3 resume code (and other real mode trampoline code) */
        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
        xen_regions[region_s3].e = bootsym_phys(trampoline_end);

        /*
         * This needs to remain in sync with the uses of the same symbols in
         * - __start_xen() (above)
         * - is_xen_fixed_mfn()
         * - tboot_shutdown()
         */

        /* hypervisor .text + .rodata */
        xen_regions[region_ro].s = __pa(&_stext);
        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
        /* hypervisor .data + .bss */
        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
    }

    start = (paddr_t)mfn << PAGE_SHIFT;
    end = start + PAGE_SIZE;
    for ( i = 0; i < nr_regions; i++ )
        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
            return 1;

    return 0;
}

static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
                                     void *ctx)
{
    struct domain *d = ctx;
    unsigned int i;

    ASSERT(e <= INT_MAX);
    for ( i = s; i <= e; i++ )
        __clear_bit(i, d->arch.hvm.io_bitmap);

    return 0;
}

void __hwdom_init setup_io_bitmap(struct domain *d)
{
    int rc;

    if ( is_hvm_domain(d) )
    {
        bitmap_fill(d->arch.hvm.io_bitmap, 0x10000);
        rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
                                    io_bitmap_cb, d);
        BUG_ON(rc);
        /*
         * NB: we need to trap accesses to 0xcf8 in order to intercept
         * 4 byte accesses, that need to be handled by Xen in order to
         * keep consistency.
         * Access to 1 byte RTC ports also needs to be trapped in order
         * to keep consistency with PV.
         */
        __set_bit(0xcf8, d->arch.hvm.io_bitmap);
        __set_bit(RTC_PORT(0), d->arch.hvm.io_bitmap);
        __set_bit(RTC_PORT(1), d->arch.hvm.io_bitmap);
    }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */