1 #include <xen/init.h>
2 #include <xen/lib.h>
3 #include <xen/err.h>
4 #include <xen/grant_table.h>
5 #include <xen/param.h>
6 #include <xen/sched.h>
7 #include <xen/domain.h>
8 #include <xen/serial.h>
9 #include <xen/softirq.h>
10 #include <xen/acpi.h>
11 #include <xen/efi.h>
12 #include <xen/console.h>
13 #include <xen/serial.h>
14 #include <xen/trace.h>
15 #include <xen/multiboot.h>
16 #include <xen/domain_page.h>
17 #include <xen/version.h>
18 #include <xen/gdbstub.h>
19 #include <xen/hypercall.h>
20 #include <xen/keyhandler.h>
21 #include <xen/numa.h>
22 #include <xen/rcupdate.h>
23 #include <xen/vga.h>
24 #include <xen/dmi.h>
25 #include <xen/pfn.h>
26 #include <xen/nodemask.h>
27 #include <xen/virtual_region.h>
28 #include <xen/watchdog.h>
29 #include <public/version.h>
30 #include <compat/platform.h>
31 #include <compat/xen.h>
32 #include <xen/bitops.h>
33 #include <asm/smp.h>
34 #include <asm/processor.h>
35 #include <asm/mpspec.h>
36 #include <asm/apic.h>
37 #include <asm/msi.h>
38 #include <asm/desc.h>
39 #include <asm/paging.h>
40 #include <asm/e820.h>
41 #include <xen/kexec.h>
42 #include <asm/edd.h>
43 #include <xsm/xsm.h>
44 #include <asm/tboot.h>
45 #include <asm/bzimage.h> /* for bzimage_headroom */
46 #include <asm/mach-generic/mach_apic.h> /* for generic_apic_probe */
47 #include <asm/setup.h>
48 #include <xen/cpu.h>
49 #include <asm/nmi.h>
50 #include <asm/alternative.h>
51 #include <asm/mc146818rtc.h>
52 #include <asm/cpuid.h>
53 #include <asm/spec_ctrl.h>
54 #include <asm/guest.h>
55 #include <asm/microcode.h>
56 #include <asm/pv/domain.h>
57 
/* opt_nosmp: If true, secondary processors are ignored. */
static bool __initdata opt_nosmp;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int __initdata max_cpus;
integer_param("maxcpus", max_cpus);

/* opt_smt: use of secondary hyperthreads; -1 = no explicit setting given. */
int8_t __read_mostly opt_smt = -1;
boolean_param("smt", opt_smt);

/* opt_invpcid: If false, don't use INVPCID instruction even if available. */
static bool __initdata opt_invpcid = true;
boolean_param("invpcid", opt_invpcid);
bool __read_mostly use_invpcid;

/* CR4 bits (e.g. X86_CR4_SMAP) toggled around 32-bit PV guest operation. */
unsigned long __read_mostly cr4_pv32_mask;

/* **** Linux config option: propagated to domain0. */
/* "acpi=off":    Disables both ACPI table parsing and interpreter. */
/* "acpi=force":  Override the disable blacklist.                   */
/* "acpi=ht":     Limit ACPI just to boot-time to enable HT.        */
/* "acpi=noirq":  Disables ACPI interrupt routing.                  */
static int parse_acpi_param(const char *s);
custom_param("acpi", parse_acpi_param);

/* **** Linux config option: propagated to domain0. */
/* noapic: Disable IOAPIC setup. */
boolean_param("noapic", skip_ioapic_setup);

/* **** Linux config option: propagated to domain0. */
/* xen_cpuidle: xen control cstate. */
s8 __read_mostly xen_cpuidle = -1;
boolean_param("cpuidle", xen_cpuidle);

#ifndef NDEBUG
/* highmem-start: debug aid placing a boundary on low-memory allocations. */
unsigned long __initdata highmem_start;
size_param("highmem-start", highmem_start);
#endif

#ifdef CONFIG_XEN_SHSTK
/* opt_xen_shstk: use CET shadow stacks for Xen itself, when supported. */
static bool __initdata opt_xen_shstk = true;
#else
#define opt_xen_shstk false
#endif
103 
parse_cet(const char * s)104 static int __init parse_cet(const char *s)
105 {
106     const char *ss;
107     int val, rc = 0;
108 
109     do {
110         ss = strchr(s, ',');
111         if ( !ss )
112             ss = strchr(s, '\0');
113 
114         if ( (val = parse_boolean("shstk", s, ss)) >= 0 )
115         {
116 #ifdef CONFIG_XEN_SHSTK
117             opt_xen_shstk = val;
118 #else
119             no_config_param("XEN_SHSTK", "cet", s, ss);
120 #endif
121         }
122         else
123             rc = -EINVAL;
124 
125         s = ss + 1;
126     } while ( *ss );
127 
128     return rc;
129 }
130 custom_param("cet", parse_cet);
131 
cpumask_t __read_mostly cpu_present_map;

/* Physical address at which the Xen image was loaded. */
unsigned long __read_mostly xen_phys_start;

/* End of Xen's own virtual address range. */
unsigned long __read_mostly xen_virt_end;

/* The BSP's boot-time stack, aligned as all Xen CPU stacks must be. */
char __section(".bss.stack_aligned") __aligned(STACK_SIZE)
    cpu0_stack[STACK_SIZE];

struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };

unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4;

/* smep: Enable/disable Supervisor Mode Execution Protection */
#define SMEP_HVM_ONLY (-2)
static s8 __initdata opt_smep = -1;

/*
 * Initial domain place holder. Needs to be global so it can be created in
 * __start_xen and unpaused in init_done.
 */
static struct domain *__initdata dom0;
154 
parse_smep_param(const char * s)155 static int __init parse_smep_param(const char *s)
156 {
157     if ( !*s )
158     {
159         opt_smep = 1;
160         return 0;
161     }
162 
163     switch ( parse_bool(s, NULL) )
164     {
165     case 0:
166         opt_smep = 0;
167         return 0;
168     case 1:
169         opt_smep = 1;
170         return 0;
171     }
172 
173     if ( !strcmp(s, "hvm") )
174         opt_smep = SMEP_HVM_ONLY;
175     else
176         return -EINVAL;
177 
178     return 0;
179 }
180 custom_param("smep", parse_smep_param);
181 
182 /* smap: Enable/disable Supervisor Mode Access Prevention */
183 #define SMAP_HVM_ONLY (-2)
184 static s8 __initdata opt_smap = -1;
185 
parse_smap_param(const char * s)186 static int __init parse_smap_param(const char *s)
187 {
188     if ( !*s )
189     {
190         opt_smap = 1;
191         return 0;
192     }
193 
194     switch ( parse_bool(s, NULL) )
195     {
196     case 0:
197         opt_smap = 0;
198         return 0;
199     case 1:
200         opt_smap = 1;
201         return 0;
202     }
203 
204     if ( !strcmp(s, "hvm") )
205         opt_smap = SMAP_HVM_ONLY;
206     else
207         return -EINVAL;
208 
209     return 0;
210 }
211 custom_param("smap", parse_smap_param);
212 
bool __read_mostly acpi_disabled;
bool __initdata acpi_force;
/* Saved "acpi=" value; propagated onto dom0's command line by create_dom0(). */
static char __initdata acpi_param[10] = "";

/*
 * Interpret the "acpi=" option (see the comment block by its custom_param()
 * registration above for the recognised values).
 */
static int __init parse_acpi_param(const char *s)
{
    /* Save the parameter so it can be propagated to domain0. */
    safe_strcpy(acpi_param, s);

    /* Interpret the parameter for use within Xen. */
    if ( !parse_bool(s, NULL) )
    {
        disable_acpi();
    }
    else if ( !strcmp(s, "force") )
    {
        acpi_force = true;
        acpi_ht = 1;
        acpi_disabled = false;
    }
    else if ( !strcmp(s, "ht") )
    {
        /* Don't undo an earlier "acpi=force". */
        if ( !acpi_force )
            disable_acpi();
        acpi_ht = 1;
    }
    else if ( !strcmp(s, "noirq") )
    {
        acpi_noirq_set();
    }
    else
        return -EINVAL;

    return 0;
}
248 
/* Boot module array, retained until the modules' memory is released. */
static const module_t *__initdata initial_images;
static unsigned int __initdata nr_initial_images;

/*
 * Count how many pages of the boot modules fall inside the given NUMA
 * node's PFN range.  At this point mod_start holds a frame number (see
 * discard_initial_images()) and mod_end a byte size.
 */
unsigned long __init initial_images_nrpages(nodeid_t node)
{
    unsigned long node_start = node_start_pfn(node);
    unsigned long node_end = node_end_pfn(node);
    unsigned long nr;
    unsigned int i;

    for ( nr = i = 0; i < nr_initial_images; ++i )
    {
        unsigned long start = initial_images[i].mod_start;
        unsigned long end = start + PFN_UP(initial_images[i].mod_end);

        /* Accumulate the overlap of [start, end) with the node's range. */
        if ( end > node_start && node_end > start )
            nr += min(node_end, end) - max(node_start, start);
    }

    return nr;
}
270 
/* Hand the boot modules' memory over to the domheap allocator. */
void __init discard_initial_images(void)
{
    unsigned int i;

    for ( i = 0; i < nr_initial_images; ++i )
    {
        /* mod_start holds a frame number here; convert to a byte address. */
        uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT;

        init_domheap_pages(start,
                           start + PAGE_ALIGN(initial_images[i].mod_end));
    }

    nr_initial_images = 0;
    initial_images = NULL;
}
286 
287 extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];
288 
/*
 * Bring up the scheduler (which creates the idle domain/vCPUs) and switch
 * the BSP into idle vCPU context.
 */
static void __init init_idle_domain(void)
{
    scheduler_init();
    set_current(idle_vcpu[0]);
    this_cpu(curr_vcpu) = current;
}
295 
srat_detect_node(int cpu)296 void srat_detect_node(int cpu)
297 {
298     nodeid_t node;
299     u32 apicid = x86_cpu_to_apicid[cpu];
300 
301     node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
302     if ( node == NUMA_NO_NODE )
303         node = 0;
304 
305     node_set_online(node);
306     numa_set_node(cpu, node);
307 
308     if ( opt_cpu_info && acpi_numa > 0 )
309         printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
310 }
311 
/*
 * Sort CPUs by <node,package,core,thread> tuple. Fortunately this hierarchy is
 * reflected in the structure of modern APIC identifiers, so we sort based on
 * those. This is slightly complicated by the fact that the BSP must remain
 * CPU 0. Hence we do a variation on longest-prefix matching to do the best we
 * can while keeping CPU 0 static.
 */
static void __init normalise_cpu_order(void)
{
    unsigned int i, j, min_cpu;
    uint32_t apicid, diff, min_diff;

    for_each_present_cpu ( i )
    {
        apicid = x86_cpu_to_apicid[i];
        min_diff = min_cpu = ~0u;

        /*
         * Find remaining CPU with longest-prefix match on APIC ID.
         * Among identical longest-prefix matches, pick the smallest APIC ID.
         */
        for ( j = cpumask_next(i, &cpu_present_map);
              j < nr_cpu_ids;
              j = cpumask_next(j, &cpu_present_map) )
        {
            /*
             * Reduce the XOR of the two IDs to its highest set bit: the
             * longer the shared prefix, the smaller the resulting value.
             */
            diff = x86_cpu_to_apicid[j] ^ apicid;
            while ( diff & (diff-1) )
                diff &= diff-1;
            if ( (diff < min_diff) ||
                 ((diff == min_diff) &&
                  (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) )
            {
                min_diff = diff;
                min_cpu = j;
            }
        }

        /* If no match then there must be no CPUs remaining to consider. */
        if ( min_cpu >= nr_cpu_ids )
        {
            BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids);
            break;
        }

        /* Switch the best-matching CPU with the next CPU in logical order. */
        j = cpumask_next(i, &cpu_present_map);
        apicid = x86_cpu_to_apicid[min_cpu];
        x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j];
        x86_cpu_to_apicid[j] = apicid;
    }
}
363 
#define BOOTSTRAP_MAP_BASE  (16UL << 20)
#define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT)

/*
 * Ensure a given physical memory range is present in the bootstrap mappings.
 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
 * Returns the virtual address of the module's data, or NULL on failure.
 * Passing mod == NULL tears down all bootstrap mappings and resets the window.
 */
void *__init bootstrap_map(const module_t *mod)
{
    /* Next free virtual address inside the bootstrap window. */
    static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE;
    uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1;
    void *ret;

    /* Once past early boot, modules are reachable through the directmap. */
    if ( system_state != SYS_STATE_early_boot )
        return mod ? mfn_to_virt(mod->mod_start) : NULL;

    if ( !mod )
    {
        destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT);
        map_cur = BOOTSTRAP_MAP_BASE;
        return NULL;
    }

    start = (uint64_t)mod->mod_start << PAGE_SHIFT;
    end = start + mod->mod_end;
    if ( start >= end )
        return NULL;

    /* Preserve the sub-superpage offset; map whole superpages around it. */
    ret = (void *)(map_cur + (unsigned long)(start & mask));
    start &= ~mask;
    end = (end + mask) & ~mask;
    if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur )
        return NULL;

    map_pages_to_xen(map_cur, maddr_to_mfn(start),
                     PFN_DOWN(end - start), PAGE_HYPERVISOR);
    map_cur += end - start;
    return ret;
}
403 
/*
 * Copy [src, src+size) to dst in chunks via the bootstrap mapping window.
 * With keep set, only a single chunk is processed and its destination
 * mapping is retained: the virtual address of the copied data is returned
 * if the whole range fitted, NULL otherwise.  Without keep, the window is
 * torn down after every chunk and NULL is always returned.
 */
static void *__init move_memory(
    uint64_t dst, uint64_t src, unsigned int size, bool keep)
{
    unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE;
    unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1;

    /* Halve the chunk size when source and destination share the window. */
    if ( src + size > BOOTSTRAP_MAP_BASE )
        blksz >>= 1;

    while ( size )
    {
        module_t mod;
        unsigned int soffs = src & mask;
        unsigned int doffs = dst & mask;
        unsigned int sz;
        void *d, *s;

        /* Map the source chunk. */
        mod.mod_start = (src - soffs) >> PAGE_SHIFT;
        mod.mod_end = soffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        sz = mod.mod_end - soffs;
        s = bootstrap_map(&mod);

        /* Map the destination chunk, clipping sz to what fits in both. */
        mod.mod_start = (dst - doffs) >> PAGE_SHIFT;
        mod.mod_end = doffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        if ( sz > mod.mod_end - doffs )
            sz = mod.mod_end - doffs;
        d = bootstrap_map(&mod);

        /* Ranges may overlap, hence memmove() rather than memcpy(). */
        memmove(d + doffs, s + soffs, sz);

        dst += sz;
        src += sz;
        size -= sz;

        if ( keep )
            return size ? NULL : d + doffs;

        /* Release the window ready for the next chunk. */
        bootstrap_map(NULL);
    }

    return NULL;
}

#undef BOOTSTRAP_MAP_LIMIT
452 
/*
 * Find space for size bytes within [s, e) which doesn't overlap any boot
 * module other than this_mod.  For each overlapping module the range is
 * split, preferring the sub-range above the module and falling back to the
 * one below it.  Returns the end address of a suitable area, or 0.
 */
static uint64_t __init consider_modules(
    uint64_t s, uint64_t e, uint32_t size, const module_t *mod,
    unsigned int nr_mods, unsigned int this_mod)
{
    unsigned int i;

    if ( s > e || e - s < size )
        return 0;

    for ( i = 0; i < nr_mods ; ++i )
    {
        uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
        uint64_t end = start + PAGE_ALIGN(mod[i].mod_end);

        /* The module being placed doesn't constrain us. */
        if ( i == this_mod )
            continue;

        if ( s < end && start < e )
        {
            /* Prefer the space above the overlapping module... */
            end = consider_modules(end, e, size, mod + i + 1,
                                   nr_mods - i - 1, this_mod - i - 1);
            if ( end )
                return end;

            /* ...falling back to the space below it. */
            return consider_modules(s, start, size, mod + i + 1,
                                    nr_mods - i - 1, this_mod - i - 1);
        }
    }

    return e;
}
484 
setup_max_pdx(unsigned long top_page)485 static void __init setup_max_pdx(unsigned long top_page)
486 {
487     max_pdx = pfn_to_pdx(top_page - 1) + 1;
488 
489     if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
490         max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;
491 
492     if ( max_pdx > FRAMETABLE_NR )
493         max_pdx = FRAMETABLE_NR;
494 
495     if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
496         max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);
497 
498 #ifdef PAGE_LIST_NULL
499     if ( max_pdx >= PAGE_LIST_NULL )
500         max_pdx = PAGE_LIST_NULL - 1;
501 #endif
502 
503     max_page = pdx_to_pfn(max_pdx - 1) + 1;
504 }
505 
/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

#ifdef CONFIG_VIDEO
/*
 * Video state handed over by the real-mode boot code (accessed via
 * bootsym()).  The offsets noted per field must stay in sync with the code
 * filling the structure in.
 */
struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};
extern struct boot_video_info boot_vid_info;
#endif
541 
/* Translate the trampoline-provided video data into vga_console_info. */
static void __init parse_video_info(void)
{
#ifdef CONFIG_VIDEO
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    /* vga_console_info is filled directly on EFI platform. */
    if ( efi_enabled(EFI_BOOT) )
        return;

    /* isVGA == 1, mode == 3: plain VGA text mode. */
    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    /* isVGA == 0x23: VESA linear frame buffer. */
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
#endif
}
582 
/* Carve the crash-kernel region out of the e820 map; runs at most once. */
static void __init kexec_reserve_area(struct e820map *e820)
{
#ifdef CONFIG_KEXEC
    unsigned long kdump_start = kexec_crash_area.start;
    unsigned long kdump_size  = kexec_crash_area.size;
    static bool __initdata is_reserved = false;

    /* Round the size up to page granularity. */
    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;

    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
        return;

    is_reserved = true;

    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
    {
        /* Failed - disable kdump by zapping the area so nothing uses it. */
        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)"
               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
        kexec_crash_area.start = kexec_crash_area.size = 0;
    }
    else
    {
        printk("Kdump: %luMB (%lukB) at %#lx\n",
               kdump_size >> 20, kdump_size >> 10, kdump_start);
    }
#endif
}
610 
using_2M_mapping(void)611 static inline bool using_2M_mapping(void)
612 {
613     return !l1_table_offset((unsigned long)__2M_text_end) &&
614            !l1_table_offset((unsigned long)__2M_rodata_start) &&
615            !l1_table_offset((unsigned long)__2M_rodata_end) &&
616            !l1_table_offset((unsigned long)__2M_init_start) &&
617            !l1_table_offset((unsigned long)__2M_init_end) &&
618            !l1_table_offset((unsigned long)__2M_rwdata_start) &&
619            !l1_table_offset((unsigned long)__2M_rwdata_end);
620 }
621 
init_done(void)622 static void noinline init_done(void)
623 {
624     void *va;
625     unsigned long start, end;
626 
627     system_state = SYS_STATE_active;
628 
629     domain_unpause_by_systemcontroller(dom0);
630 
631     /* MUST be done prior to removing .init data. */
632     unregister_init_virtual_region();
633 
634     /* Zero the .init code and data. */
635     for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE )
636         clear_page(va);
637 
638     /* Destroy Xen's mappings, and reuse the pages. */
639     if ( using_2M_mapping() )
640     {
641         start = (unsigned long)&__2M_init_start,
642         end   = (unsigned long)&__2M_init_end;
643     }
644     else
645     {
646         start = (unsigned long)&__init_begin;
647         end   = (unsigned long)&__init_end;
648     }
649 
650     destroy_xen_mappings(start, end);
651     init_xenheap_pages(__pa(start), __pa(end));
652     printk("Freed %lukB init memory\n", (end - start) >> 10);
653 
654     startup_cpu_idle_loop();
655 }
656 
/* Reinitialise all state referring to the old virtual address of the stack. */
static void __init noreturn reinit_bsp_stack(void)
{
    unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));

    /* Update TSS and ISTs */
    load_system_tables();

    /* Update SYSCALL trampolines */
    percpu_traps_init();

    stack_base[0] = stack;
    memguard_guard_stack(stack);

    if ( IS_ENABLED(CONFIG_XEN_SHSTK) && cpu_has_xen_shstk )
    {
        /*
         * Point MSR_PL0_SSP just below the top of the primary shadow stack
         * slot, enable shadow stacks, then activate the supervisor token.
         */
        wrmsrl(MSR_PL0_SSP,
               (unsigned long)stack + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8);
        wrmsrl(MSR_S_CET, CET_SHSTK_EN | CET_WRSS_EN);
        asm volatile ("setssbsy" ::: "memory");
    }

    reset_stack_and_jump(init_done);
}
681 
682 /*
683  * Some scripts add "placeholder" to work around a grub error where it ate the
684  * first parameter.
685  */
686 ignore_param("placeholder");
687 
loader_is_grub2(const char * loader_name)688 static bool __init loader_is_grub2(const char *loader_name)
689 {
690     /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
691     const char *p = strstr(loader_name, "GRUB ");
692     return (p != NULL) && (p[5] != '0');
693 }
694 
cmdline_cook(char * p,const char * loader_name)695 static char * __init cmdline_cook(char *p, const char *loader_name)
696 {
697     p = p ? : "";
698 
699     /* Strip leading whitespace. */
700     while ( *p == ' ' )
701         p++;
702 
703     /* GRUB2 and PVH don't not include image name as first item on command line. */
704     if ( xen_guest || loader_is_grub2(loader_name) )
705         return p;
706 
707     /* Strip image name plus whitespace. */
708     while ( (*p != ' ') && (*p != '\0') )
709         p++;
710     while ( *p == ' ' )
711         p++;
712 
713     return p;
714 }
715 
copy_bios_e820(struct e820entry * map,unsigned int limit)716 static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int limit)
717 {
718     unsigned int n = min(bootsym(bios_e820nr), limit);
719 
720     if ( n )
721         memcpy(map, bootsym(bios_e820map), sizeof(*map) * n);
722 
723     return n;
724 }
725 
/*
 * Create and construct the initial domain from the given kernel image
 * module.  headroom is the decompression headroom required by the kernel,
 * initrd the (optional) ramdisk module, kextra extra command line text for
 * dom0 (or NULL), and loader the boot loader name (used when cooking the
 * kernel's command line).  Panics on failure.
 */
static struct domain *__init create_dom0(const module_t *image,
                                         unsigned long headroom,
                                         module_t *initrd, const char *kextra,
                                         const char *loader)
{
    struct xen_domctl_createdomain dom0_cfg = {
        .flags = IS_ENABLED(CONFIG_TBOOT) ? XEN_DOMCTL_CDF_s3_integrity : 0,
        .max_evtchn_port = -1,
        .max_grant_frames = -1,
        .max_maptrack_frames = -1,
        .max_vcpus = dom0_max_vcpus(),
    };
    struct domain *d;
    char *cmdline;

    if ( opt_dom0_pvh )
    {
        /* PVH dom0: HVM container, using HAP unless shadow was requested. */
        dom0_cfg.flags |= (XEN_DOMCTL_CDF_hvm |
                           ((hvm_hap_supported() && !opt_dom0_shadow) ?
                            XEN_DOMCTL_CDF_hap : 0));

        dom0_cfg.arch.emulation_flags |=
            XEN_X86_EMU_LAPIC | XEN_X86_EMU_IOAPIC | XEN_X86_EMU_VPCI;
    }

    if ( iommu_enabled )
        dom0_cfg.flags |= XEN_DOMCTL_CDF_iommu;

    /* Create initial domain 0. */
    d = domain_create(get_initial_domain_id(), &dom0_cfg, !pv_shim);
    if ( IS_ERR(d) || (alloc_dom0_vcpu0(d) == NULL) )
        panic("Error creating domain 0\n");

    /* Grab the DOM0 command line. */
    cmdline = image->string ? __va(image->string) : NULL;
    if ( cmdline || kextra )
    {
        static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE];

        cmdline = cmdline_cook(cmdline, loader);
        safe_strcpy(dom0_cmdline, cmdline);

        if ( kextra )
            /* kextra always includes exactly one leading space. */
            safe_strcat(dom0_cmdline, kextra);

        /* Append any extra parameters. */
        if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
            safe_strcat(dom0_cmdline, " noapic");
        if ( (strlen(acpi_param) == 0) && acpi_disabled )
        {
            printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
            safe_strcpy(acpi_param, "off");
        }
        if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
        {
            safe_strcat(dom0_cmdline, " acpi=");
            safe_strcat(dom0_cmdline, acpi_param);
        }

        cmdline = dom0_cmdline;
    }

    /*
     * Temporarily clear SMAP in CR4 to allow user-accesses in construct_dom0().
     * This saves a large number of corner cases interactions with
     * copy_from_user().
     */
    if ( cpu_has_smap )
    {
        cr4_pv32_mask &= ~X86_CR4_SMAP;
        write_cr4(read_cr4() & ~X86_CR4_SMAP);
    }

    if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 )
        panic("Could not construct domain 0\n");

    /* Restore SMAP enforcement. */
    if ( cpu_has_smap )
    {
        write_cr4(read_cr4() | X86_CR4_SMAP);
        cr4_pv32_mask |= X86_CR4_SMAP;
    }

    return d;
}
811 
812 /* How much of the directmap is prebuilt at compile time. */
813 #define PREBUILT_MAP_LIMIT (1 << L2_PAGETABLE_SHIFT)
814 
__start_xen(unsigned long mbi_p)815 void __init noreturn __start_xen(unsigned long mbi_p)
816 {
817     char *memmap_type = NULL;
818     char *cmdline, *kextra, *loader;
819     unsigned int initrdidx, num_parked = 0;
820     multiboot_info_t *mbi;
821     module_t *mod;
822     unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
823     int i, j, e820_warn = 0, bytes = 0;
824     bool acpi_boot_table_init_done = false, relocated = false;
825     int ret;
826     struct ns16550_defaults ns16550 = {
827         .data_bits = 8,
828         .parity    = 'n',
829         .stop_bits = 1
830     };
831     const char *hypervisor_name;
832 
833     /* Critical region without IDT or TSS.  Any fault is deadly! */
834 
835     init_shadow_spec_ctrl_state();
836 
837     percpu_init_areas();
838 
839     init_idt_traps();
840     load_system_tables();
841 
842     smp_prepare_boot_cpu();
843     sort_exception_tables();
844 
845     setup_virtual_regions(__start___ex_table, __stop___ex_table);
846 
847     /* Full exception support from here on in. */
848 
849     /* Enable NMIs.  Our loader (e.g. Tboot) may have left them disabled. */
850     enable_nmis();
851 
852     if ( pvh_boot )
853     {
854         ASSERT(mbi_p == 0);
855         pvh_init(&mbi, &mod);
856     }
857     else
858     {
859         mbi = __va(mbi_p);
860         mod = __va(mbi->mods_addr);
861     }
862 
863     loader = (mbi->flags & MBI_LOADERNAME)
864         ? (char *)__va(mbi->boot_loader_name) : "unknown";
865 
866     /* Parse the command-line options. */
867     cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
868                            __va(mbi->cmdline) : NULL,
869                            loader);
870     if ( (kextra = strstr(cmdline, " -- ")) != NULL )
871     {
872         /*
873          * Options after ' -- ' separator belong to dom0.
874          *  1. Orphan dom0's options from Xen's command line.
875          *  2. Skip all but final leading space from dom0's options.
876          */
877         *kextra = '\0';
878         kextra += 3;
879         while ( kextra[1] == ' ' ) kextra++;
880     }
881     cmdline_parse(cmdline);
882 
883     /* Must be after command line argument parsing and before
884      * allocing any xenheap structures wanted in lower memory. */
885     kexec_early_calculations();
886 
887     /*
888      * The probing has to be done _before_ initialising console,
889      * otherwise we couldn't set up Xen's PV console correctly.
890      */
891     hypervisor_name = hypervisor_probe();
892 
893     parse_video_info();
894 
895     rdmsrl(MSR_EFER, this_cpu(efer));
896     asm volatile ( "mov %%cr4,%0" : "=r" (get_cpu_info()->cr4) );
897 
898     /* We initialise the serial devices very early so we can get debugging. */
899     ns16550.io_base = 0x3f8;
900     ns16550.irq     = 4;
901     ns16550_init(0, &ns16550);
902     ns16550.io_base = 0x2f8;
    /*
     * Bring up an early console (legacy COM port on IRQ 3, EHCI debug
     * port, and the pre-IRQ console layer) so the boot banner below is
     * visible as early as possible.
     */
    ns16550.irq     = 3;
    ns16550_init(1, &ns16550);
    ehci_dbgp_init();
    console_init_preirq();

    if ( pvh_boot )
        pvh_print_info();

    printk("Bootloader: %s\n", loader);

    printk("Command line: %s\n", cmdline);

    printk("Xen image load base address: %#lx\n", xen_phys_start);
    /* hypervisor_name is non-NULL when Xen itself runs on a hypervisor. */
    if ( hypervisor_name )
        printk("Running on %s\n", hypervisor_name);

#ifdef CONFIG_VIDEO
    printk("Video information:\n");

    /* Print VGA display mode information. */
    switch ( vga_console_info.video_type )
    {
    case XEN_VGATYPE_TEXT_MODE_3:
        printk(" VGA is text mode %dx%d, font 8x%d\n",
               vga_console_info.u.text_mode_3.columns,
               vga_console_info.u.text_mode_3.rows,
               vga_console_info.u.text_mode_3.font_height);
        break;
    case XEN_VGATYPE_VESA_LFB:
    case XEN_VGATYPE_EFI_LFB:
        printk(" VGA is graphics mode %dx%d, %d bpp\n",
               vga_console_info.u.vesa_lfb.width,
               vga_console_info.u.vesa_lfb.height,
               vga_console_info.u.vesa_lfb.bits_per_pixel);
        break;
    default:
        printk(" No VGA detected\n");
        break;
    }

    /* Print VBE/DDC EDID information. */
    /* 0x1313 / 0x13131313 appear to be "not filled in" sentinels left by
     * the real-mode boot trampoline — confirm against boot/ code. */
    if ( bootsym(boot_edid_caps) != 0x1313 )
    {
        u16 caps = bootsym(boot_edid_caps);
        printk(" VBE/DDC methods:%s%s%s; ",
               (caps & 1) ? " V1" : "",
               (caps & 2) ? " V2" : "",
               !(caps & 3) ? " none" : "");
        /* The high byte of caps holds the EDID transfer time in seconds. */
        printk("EDID transfer time: %d seconds\n", caps >> 8);
        if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
        {
            printk(" EDID info not retrieved because ");
            if ( !(caps & 3) )
                printk("no DDC retrieval method detected\n");
            else if ( (caps >> 8) > 5 )
                printk("takes longer than 5 seconds\n");
            else
                printk("of reasons unknown\n");
        }
    }
#endif

    printk("Disc information:\n");
    printk(" Found %d MBR signatures\n",
           bootsym(boot_mbr_signature_nr));
    printk(" Found %d EDD information structures\n",
           bootsym(boot_edd_info_nr));
970 
    /* Check that we have at least one Multiboot module. */
    if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
        panic("dom0 kernel not specified. Check bootloader configuration\n");

    /* Check that we don't have a silly number of modules. */
    if ( mbi->mods_count > sizeof(module_map) * 8 )
    {
        mbi->mods_count = sizeof(module_map) * 8;
        printk("Excessive multiboot modules - using the first %u only\n",
               mbi->mods_count);
    }

    /* One bit per module; set bits presumably mean "not yet claimed by a
     * consumer" (cf. xsm_multiboot_init()/microcode_grab_module() below). */
    bitmap_fill(module_map, mbi->mods_count);
    __clear_bit(0, module_map); /* Dom0 kernel is always first */

    /*
     * Establish e820_raw and memmap_type from the best available source,
     * in decreasing order of preference.
     */
    if ( pvh_boot )
    {
        /* pvh_init() already filled in e820_raw */
        memmap_type = "PVH-e820";
    }
    else if ( efi_enabled(EFI_LOADER) )
    {
        set_pdx_range(xen_phys_start >> PAGE_SHIFT,
                      (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT);

        /* Clean up boot loader identity mappings. */
        destroy_xen_mappings(xen_phys_start,
                             xen_phys_start + BOOTSTRAP_MAP_BASE);

        /* Make boot page tables match non-EFI boot. */
        l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] =
            l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR);

        memmap_type = loader;
    }
    else if ( efi_enabled(EFI_BOOT) )
        memmap_type = "EFI";
    else if ( (e820_raw.nr_map =
                   copy_bios_e820(e820_raw.map,
                                  ARRAY_SIZE(e820_raw.map))) != 0 )
    {
        memmap_type = "Xen-e820";
    }
    else if ( mbi->flags & MBI_MEMMAP )
    {
        memmap_type = "Multiboot-e820";
        while ( bytes < mbi->mmap_length &&
                e820_raw.nr_map < ARRAY_SIZE(e820_raw.map) )
        {
            memory_map_t *map = __va(mbi->mmap_addr + bytes);

            /*
             * This is a gross workaround for a BIOS bug. Some bootloaders do
             * not write e820 map entries into pre-zeroed memory. This is
             * okay if the BIOS fills in all fields of the map entry, but
             * some broken BIOSes do not bother to write the high word of
             * the length field if the length is smaller than 4GB. We
             * detect and fix this by flagging sections below 4GB that
             * appear to be larger than 4GB in size.
             */
            if ( (map->base_addr_high == 0) && (map->length_high != 0) )
            {
                if ( !e820_warn )
                {
                    printk("WARNING: Buggy e820 map detected and fixed "
                           "(truncated length fields).\n");
                    e820_warn = 1;
                }
                map->length_high = 0;
            }

            e820_raw.map[e820_raw.nr_map].addr =
                ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
            e820_raw.map[e820_raw.nr_map].size =
                ((u64)map->length_high << 32) | (u64)map->length_low;
            e820_raw.map[e820_raw.nr_map].type = map->type;
            e820_raw.nr_map++;

            /* Per the Multiboot spec, an entry's size field does not count
             * itself, hence the extra 4 bytes. */
            bytes += map->size + 4;
        }
    }
    else if ( bootsym(lowmem_kb) )
    {
        /* Fall back to the INT 15h E801 lowmem/highmem split. */
        memmap_type = "Xen-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = bootsym(lowmem_kb) << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = bootsym(highmem_kb) << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else if ( mbi->flags & MBI_MEMLIMITS )
    {
        memmap_type = "Multiboot-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = mbi->mem_lower << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = mbi->mem_upper << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else
        panic("Bootloader provided no memory information\n");
1076 
    /* This must come before e820 code because it sets paddr_bits. */
    early_cpu_init();

    /* Choose shadow stack early, to set infrastructure up appropriately. */
    if ( opt_xen_shstk && boot_cpu_has(X86_FEATURE_CET_SS) )
    {
        printk("Enabling Supervisor Shadow Stacks\n");

        setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK);
#ifdef CONFIG_PV32
        /* Shadow stacks and 32-bit PV guests are mutually exclusive here;
         * shadow stacks win, so force PV32 off. */
        if ( opt_pv32 )
        {
            opt_pv32 = 0;
            printk("  - Disabling PV32 due to Shadow Stacks\n");
        }
#endif
    }

    /* Sanitise the raw E820 map to produce a final clean version. */
    max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
1097 
    if ( !efi_enabled(EFI_BOOT) && e820_raw.nr_map >= 1 )
    {
        /*
         * Supplement the heuristics in l1tf_calculations() by assuming that
         * anything referenced in the E820 may be cacheable.
         */
        l1tf_safe_maddr =
            max(l1tf_safe_maddr,
                ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
                        e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
    }

    /* Create a temporary copy of the E820 map. */
    memcpy(&boot_e820, &e820, sizeof(e820));

    /* Early kexec reservation (explicit static start address). */
    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
    kexec_reserve_area(&boot_e820);

    initial_images = mod;
    nr_initial_images = mbi->mods_count;

    /*
     * Normalise module descriptors from [start, end) byte addresses to
     * (start PFN, byte size) form; modules were verified page-aligned.
     * EFI loader entries are already in this form.
     */
    for ( i = 0; !efi_enabled(EFI_LOADER) && i < mbi->mods_count; i++ )
    {
        if ( mod[i].mod_start & (PAGE_SIZE - 1) )
            panic("Bootloader didn't honor module alignment request\n");
        mod[i].mod_end -= mod[i].mod_start;
        mod[i].mod_start >>= PAGE_SHIFT;
        mod[i].reserved = 0;
    }

    if ( xen_phys_start )
    {
        relocated = true;

        /*
         * This needs to remain in sync with xen_in_range() and the
         * respective reserve_e820_ram() invocation below.
         */
        /* Track the Xen image itself as an extra pseudo-module. */
        mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
        mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
    }

    /* Headroom the (possibly bzImage) dom0 kernel needs for decompression. */
    modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end);
    bootstrap_map(NULL);

#ifndef highmem_start
    /* Don't allow split below 4Gb. */
    if ( highmem_start < GB(4) )
        highmem_start = 0;
    else /* align to L3 entry boundary */
        highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
#endif
1155 
    /*
     * Iterate backwards over all superpage-aligned RAM regions.
     *
     * We require superpage alignment because the boot allocator is
     * not yet initialised. Hence we can only map superpages in the
     * address range PREBUILT_MAP_LIMIT to 4GB, as this is guaranteed
     * not to require dynamic allocation of pagetables.
     *
     * As well as mapping superpages in that range, in preparation for
     * initialising the boot allocator, we also look for a region to which
     * we can relocate the dom0 kernel and other multiboot modules. Also, on
     * x86/64, we relocate Xen to higher memory.
     */
    for ( i = boot_e820.nr_map-1; i >= 0; i-- )
    {
        uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
        uint64_t end, limit = ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT;

        if ( boot_e820.map[i].type != E820_RAM )
            continue;

        /* Superpage-aligned chunks from PREBUILT_MAP_LIMIT. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
        if ( s >= e )
            continue;

        if ( s < limit )
        {
            end = min(e, limit);
            set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT);
            map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
                             PFN_DOWN(end - s), PAGE_HYPERVISOR);
        }

        /* Clamp e to the directmap span and to 1 << (PAGE_SHIFT + 32) —
         * the latter presumably because mod_start is a 32-bit PFN; confirm
         * against the module_t definition. */
        if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                     1UL << (PAGE_SHIFT + 32)) )
            e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                    1UL << (PAGE_SHIFT + 32));
/* Size of the Xen image, rounded up to a 2M boundary. */
#define reloc_size ((__pa(__2M_rwdata_end) + mask) & ~mask)
        /* Is the region suitable for relocating Xen? */
        if ( !xen_phys_start && e <= limit )
        {
            /* Don't overlap with modules. */
            end = consider_modules(s, e, reloc_size + mask,
                                   mod, mbi->mods_count, -1);
            end &= ~mask;
        }
        else
            end = 0;

        /*
         * Is the region size greater than zero and does it begin
         * at or above the end of current Xen image placement?
         */
        if ( (end > s) && (end - reloc_size + XEN_IMG_OFFSET >= __pa(_end)) )
        {
            l4_pgentry_t *pl4e;
            l3_pgentry_t *pl3e;
            l2_pgentry_t *pl2e;
            int i, j, k;    /* NB: this i deliberately shadows the outer one. */
            unsigned long pte_update_limit;

            /* Select relocation address. */
            xen_phys_start = end - reloc_size;
            e = xen_phys_start + XEN_IMG_OFFSET;
            bootsym(trampoline_xen_phys_start) = xen_phys_start;

            /*
             * No PTEs pointing above this address are candidates for relocation.
             * Due to possibility of partial overlap of the end of source image
             * and the beginning of region for destination image some PTEs may
             * point to addresses in range [e, e + XEN_IMG_OFFSET).
             */
            pte_update_limit = PFN_DOWN(e);

            /*
             * Perform relocation to new physical address.
             * Before doing so we must sync static/global data with main memory
             * with a barrier(). After this we must *not* modify static/global
             * data until after we have switched to the relocated pagetables!
             */
            barrier();
            move_memory(e, XEN_IMG_OFFSET, _end - _start, 1);

            /* Walk initial pagetables, relocating page directory entries. */
            pl4e = __va(__pa(idle_pg_table));
            for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
            {
                if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
                    continue;
                /* Entries are rebased by simply adding the relocation delta
                 * (xen_phys_start was zero before relocation). */
                *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
                                        xen_phys_start);
                pl3e = l4e_to_l3e(*pl4e);
                for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
                {
                    /* Not present, 1GB mapping, or already relocated? */
                    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
                         (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
                         (l3e_get_pfn(*pl3e) >= pte_update_limit) )
                        continue;
                    *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
                                            xen_phys_start);
                    pl2e = l3e_to_l2e(*pl3e);
                    for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
                    {
                        /* Not present, PSE, or already relocated? */
                        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                             (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
                             (l2e_get_pfn(*pl2e) >= pte_update_limit) )
                            continue;
                        *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                                xen_phys_start);
                    }
                }
            }

            /* The only data mappings to be relocated are in the Xen area. */
            pl2e = __va(__pa(l2_xenmap));
            /*
             * Undo the temporary-hooking of the l1_directmap.  __2M_text_start
             * is contained in this PTE.
             */
            BUG_ON(using_2M_mapping() &&
                   l2_table_offset((unsigned long)_erodata) ==
                   l2_table_offset((unsigned long)_stext));
            *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
            for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
            {
                unsigned int flags;

                if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                     (l2e_get_pfn(*pl2e) >= pte_update_limit) )
                    continue;

                if ( !using_2M_mapping() )
                {
                    *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                            xen_phys_start);
                    continue;
                }

                /* With 2M mappings, pick final permissions per section. */
                if ( i < l2_table_offset((unsigned long)&__2M_text_end) )
                {
                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
                {
                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_init_end) )
                {
                    flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
                }
                else if ( (i >= l2_table_offset((unsigned long)&__2M_rwdata_start) &&
                           i <  l2_table_offset((unsigned long)&__2M_rwdata_end)) )
                {
                    flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
                }
                else
                {
                    /* Not part of any image section: drop the mapping. */
                    *pl2e = l2e_empty();
                    continue;
                }

                *pl2e = l2e_from_paddr(
                    l2e_get_paddr(*pl2e) + xen_phys_start, flags);
            }

            /* Re-sync the stack and then switch to relocated pagetables. */
            /* The PGE toggle flushes global TLB entries around the CR3 load. */
            asm volatile (
                "rep movsq        ; " /* re-sync the stack */
                "movq %%cr4,%%rsi ; "
                "andb $0x7f,%%sil ; "
                "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
                "movq %[pg],%%cr3 ; " /* CR3 == new pagetables */
                "orb $0x80,%%sil  ; "
                "movq %%rsi,%%cr4   " /* CR4.PGE == 1 */
                : "=&S" (i), "=&D" (i), "=&c" (i) /* All outputs discarded. */
                :  [pg] "r" (__pa(idle_pg_table)), "0" (cpu0_stack),
                   "1" (__va(__pa(cpu0_stack))), "2" (STACK_SIZE / 8)
                : "memory" );

            bootstrap_map(NULL);

            printk("New Xen image base address: %#lx\n", xen_phys_start);
        }

        /* Is the region suitable for relocating the multiboot modules? */
        for ( j = mbi->mods_count - 1; j >= 0; j-- )
        {
            /* Only the dom0 kernel (module 0) needs decompression headroom. */
            unsigned long headroom = j ? 0 : modules_headroom;
            unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end);

            if ( mod[j].reserved )
                continue;

            /* Don't overlap with other modules (or Xen itself). */
            end = consider_modules(s, e, size, mod,
                                   mbi->mods_count + relocated, j);

            if ( highmem_start && end > highmem_start )
                continue;

            if ( s < end &&
                 (headroom ||
                  ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) )
            {
                move_memory(end - size + headroom,
                            (uint64_t)mod[j].mod_start << PAGE_SHIFT,
                            mod[j].mod_end, 0);
                mod[j].mod_start = (end - size) >> PAGE_SHIFT;
                mod[j].mod_end += headroom;
                mod[j].reserved = 1;
            }
        }

#ifdef CONFIG_KEXEC
        /*
         * Looking backwards from the crash area limit, find a large
         * enough range that does not overlap with modules.
         */
        while ( !kexec_crash_area.start )
        {
            /* Don't overlap with modules (or Xen itself). */
            e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod,
                                 mbi->mods_count + relocated, -1);
            if ( s >= e )
                break;
            if ( e > kexec_crash_area_limit )
            {
                e = kexec_crash_area_limit & PAGE_MASK;
                continue;
            }
            kexec_crash_area.start = (e - kexec_crash_area.size) & PAGE_MASK;
        }
#endif
    }
1398 
    /* Module 0 (dom0 kernel) must have been relocated if it needs headroom. */
    if ( modules_headroom && !mod->reserved )
        panic("Not enough memory to relocate the dom0 kernel image\n");
    /* Carve every module's final placement out of the temporary E820. */
    for ( i = 0; i < mbi->mods_count; ++i )
    {
        uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT;

        reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end));
    }

    if ( !xen_phys_start )
        panic("Not enough memory to relocate Xen\n");

    /* This needs to remain in sync with xen_in_range(). */
    reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));

    /* Late kexec reservation (dynamic start address). */
    kexec_reserve_area(&boot_e820);

    setup_max_pdx(raw_max_page);
    if ( highmem_start )
        xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
1420 
    /*
     * Walk every RAM region and map it in its entirety (on x86/64, at least)
     * and notify it to the boot allocator.
     */
    for ( i = 0; i < boot_e820.nr_map; i++ )
    {
        uint64_t s, e, mask = PAGE_SIZE - 1;
        uint64_t map_s, map_e;

        if ( boot_e820.map[i].type != E820_RAM )
            continue;

        /* Only page alignment required now. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, 1<<20);
        if ( s >= e )
            continue;

        /* Parse ACPI tables / SRAT once we first see RAM above 4GB. */
        if ( !acpi_boot_table_init_done &&
             s >= (1ULL << 32) &&
             !acpi_boot_table_init() )
        {
            acpi_boot_table_init_done = true;
            srat_parse_regions(s);
            setup_max_pdx(raw_max_page);
        }

        /* Clip (or skip) regions extending beyond the PDX-addressable range. */
        if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
        {
            if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx )
            {
                /* Whole region inaccessible: shrink max_page/max_pdx back to
                 * the previous RAM region or the highest module end. */
                for ( j = i - 1; ; --j )
                {
                    if ( boot_e820.map[j].type == E820_RAM )
                        break;
                    ASSERT(j);
                }
                map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
                for ( j = 0; j < mbi->mods_count; ++j )
                {
                    uint64_t end = pfn_to_paddr(mod[j].mod_start) +
                                   mod[j].mod_end;

                    if ( map_e < end )
                        map_e = end;
                }
                if ( PFN_UP(map_e) < max_page )
                {
                    max_page = PFN_UP(map_e);
                    max_pdx = pfn_to_pdx(max_page - 1) + 1;
                }
                printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                                      " %013"PRIx64"-%013"PRIx64"\n",
                       s, e);
                continue;
            }
            map_e = e;
            e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT;
            printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                                  " %013"PRIx64"-%013"PRIx64"\n",
                   e, map_e);
        }

        set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);

        /* Need to create mappings above PREBUILT_MAP_LIMIT. */
        map_s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
        map_e = min_t(uint64_t, e,
                      ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT);

        /* Pass mapped memory to allocator /before/ creating new mappings. */
        init_boot_pages(s, min(map_s, e));
        s = map_s;
        if ( s < map_e )
        {
            /* This mask shadows the page mask above: re-align to 2M here. */
            uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1;

            map_s = (s + mask) & ~mask;
            map_e &= ~mask;
            init_boot_pages(map_s, map_e);
        }

        if ( map_s > map_e )
            map_s = map_e = s;

        /* Create new mappings /before/ passing memory to the allocator. */
        if ( map_e < e )
        {
            uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1;
            uint64_t end = min(e, limit);

            if ( map_e < end )
            {
                map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
                                 PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
                init_boot_pages(map_e, end);
                map_e = end;
            }
        }
        if ( map_e < e )
        {
            /* This range must not be passed to the boot allocator and
             * must also not be mapped with _PAGE_GLOBAL. */
            map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
        }
        if ( s < map_s )
        {
            map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
                             PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
            init_boot_pages(s, map_s);
        }
    }
1535 
    /* Map every boot module into the directmap. */
    for ( i = 0; i < mbi->mods_count; ++i )
    {
        set_pdx_range(mod[i].mod_start,
                      mod[i].mod_start + PFN_UP(mod[i].mod_end));
        map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
                         _mfn(mod[i].mod_start),
                         PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
    }

#ifdef CONFIG_KEXEC
    /* Map the crash kernel area (clipped to the directmap span). */
    if ( kexec_crash_area.size )
    {
        unsigned long s = PFN_DOWN(kexec_crash_area.start);
        unsigned long e = min(s + PFN_UP(kexec_crash_area.size),
                              PFN_UP(__pa(HYPERVISOR_VIRT_END - 1)));

        if ( e > s )
            map_pages_to_xen((unsigned long)__va(kexec_crash_area.start),
                             _mfn(s), e - s, PAGE_HYPERVISOR);
    }
#endif

    /* Tear down now-stale mappings beyond the (2M-rounded) end of Xen. */
    xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) &
                   ~((1UL << L2_PAGETABLE_SHIFT) - 1);
    destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE);

    /*
     * If not using 2M mappings to gain suitable pagetable permissions
     * directly from the relocation above, remap the code/data
     * sections with decreased permissions.
     */
    if ( !using_2M_mapping() )
    {
        /* Mark .text as RX (avoiding the first 2M superpage). */
        modify_xen_mappings(XEN_VIRT_START + MB(2),
                            (unsigned long)&__2M_text_end,
                            PAGE_HYPERVISOR_RX);

        /* Mark .rodata as RO. */
        modify_xen_mappings((unsigned long)&__2M_rodata_start,
                            (unsigned long)&__2M_rodata_end,
                            PAGE_HYPERVISOR_RO);

        /* Mark .data and .bss as RW. */
        modify_xen_mappings((unsigned long)&__2M_rwdata_start,
                            (unsigned long)&__2M_rwdata_end,
                            PAGE_HYPERVISOR_RW);

        /* Drop the remaining mappings in the shattered superpage. */
        destroy_xen_mappings((unsigned long)&__2M_rwdata_end,
                             ROUNDUP((unsigned long)&__2M_rwdata_end, MB(2)));
    }
1588 
    /* Account and report total system RAM from the final E820. */
    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    printk("System RAM: %luMB (%lukB)\n",
           nr_pages >> (20 - PAGE_SHIFT),
           nr_pages << (PAGE_SHIFT - 10));
    total_pages = nr_pages;

    /* Sanity check for unwanted bloat of certain hypercall structures. */
    BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
                 sizeof(((struct xen_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
                 sizeof(((struct xen_domctl *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
                 sizeof(((struct xen_sysctl *)0)->u.pad));

    BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);

    /* The compat (32-bit guest) ABI must obey the same size constraints. */
    BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
                 sizeof(((struct compat_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);

    /* Check definitions in public headers match internal defs. */
    BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
    BUILD_BUG_ON(__HYPERVISOR_VIRT_END   != HYPERVISOR_VIRT_END);
    BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
    BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);

    init_frametable();

    /* May already have been done in the RAM-mapping loop above. */
    if ( !acpi_boot_table_init_done )
        acpi_boot_table_init();

    acpi_numa_init();

    numa_initmem_init(0, raw_max_page);

    if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
    {
        unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
        uint64_t mask = PAGE_SIZE - 1;

        if ( !highmem_start )
            xenheap_max_mfn(limit);

        end_boot_allocator();

        /* Pass the remaining memory to the allocator. */
        /* I.e. RAM above the directmap limit, which the boot allocator
         * never saw: hand it to the domain heap directly. */
        for ( i = 0; i < boot_e820.nr_map; i++ )
        {
            uint64_t s, e;

            if ( boot_e820.map[i].type != E820_RAM )
                continue;
            s = (boot_e820.map[i].addr + mask) & ~mask;
            e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
            if ( PFN_DOWN(e) <= limit )
                continue;
            if ( PFN_DOWN(s) <= limit )
                s = pfn_to_paddr(limit + 1);
            init_domheap_pages(s, e);
        }
    }
    else
        end_boot_allocator();
1658 
    /* Memory is set up: bring up the remaining subsystems in order. */
    system_state = SYS_STATE_boot;
    /*
     * No calls involving ACPI code should go between the setting of
     * SYS_STATE_boot and vm_init() (or else acpi_os_{,un}map_memory()
     * will break).
     */
    vm_init();

    console_init_ring();
    vesa_init();

    tasklet_subsys_init();

    paging_init();

    tboot_probe();

    open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);

    if ( opt_watchdog )
        nmi_watchdog = NMI_LOCAL_APIC;

    find_smp_config();

    dmi_scan_machine();

    generic_apic_probe();

    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
                                  RANGESETF_prettyprint_hex);

    xsm_multiboot_init(module_map, mbi);

    setup_system_domains();

    acpi_boot_init();

    if ( smp_found_config )
        get_smp_config();

    /*
     * In the shim case, the number of CPUs should be solely controlled by the
     * guest configuration file.
     */
    if ( pv_shim )
    {
        opt_nosmp = false;
        max_cpus = 0;
    }
    if ( opt_nosmp )
    {
        max_cpus = 0;
        set_nr_cpu_ids(1);
    }
    else
    {
        /* max_cpus == 0 means "no limit": use all detected CPUs. */
        set_nr_cpu_ids(max_cpus);
        if ( !max_cpus )
            max_cpus = nr_cpu_ids;
    }

    /* Perform guest-side setup when running virtualized ourselves. */
    if ( hypervisor_name )
        hypervisor_setup();

    /* Low mappings were only needed for some BIOS table parsing. */
    zap_low_mappings();

    init_apic_mappings();

    normalise_cpu_order();

    init_cpu_to_node();

    x2apic_bsp_setup();

    ret = init_irq_data();
    if ( ret < 0 )
        panic("Error %d setting up IRQ data\n", ret);

    console_init_irq();

    init_IRQ();

    microcode_grab_module(module_map, mbi);

    timer_init();

    early_microcode_init();

    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */

    identify_cpu(&boot_cpu_data);

    set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);

    /* Do not enable SMEP/SMAP in PV shim on AMD and Hygon by default */
    if ( opt_smep == -1 )
        opt_smep = !pv_shim || !(boot_cpu_data.x86_vendor &
                                 (X86_VENDOR_AMD | X86_VENDOR_HYGON));
    if ( opt_smap == -1 )
        opt_smap = !pv_shim || !(boot_cpu_data.x86_vendor &
                                 (X86_VENDOR_AMD | X86_VENDOR_HYGON));

    if ( !opt_smep )
        setup_clear_cpu_cap(X86_FEATURE_SMEP);
    if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
        set_in_cr4(X86_CR4_SMEP);

    if ( !opt_smap )
        setup_clear_cpu_cap(X86_FEATURE_SMAP);
    if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
        set_in_cr4(X86_CR4_SMAP);

    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;

    if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
        set_in_cr4(X86_CR4_FSGSBASE);

    if ( opt_invpcid && cpu_has_invpcid )
        use_invpcid = true;

    init_speculation_mitigations();

    init_idle_domain();

    /* Per-CPU page for emulation/syscall stubs for this (boot) CPU. */
    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
                                           &this_cpu(stubs).mfn);
    BUG_ON(!this_cpu(stubs.addr));

    trap_init();

    rcu_init();

    early_time_init();

    arch_init_memory();

    alternative_instructions();

    local_irq_enable();

    vesa_mtrr_init();

    early_msi_init();

    iommu_setup();    /* setup iommu if available */

    smp_prepare_cpus();

    spin_debug_enable();
1813 
1814     /*
1815      * Initialise higher-level timer functions. We do this fairly late
1816      * (after interrupts got enabled) because the time bases and scale
1817      * factors need to be updated regularly.
1818      */
1819     init_xen_time();
1820 
1821     initialize_keytable();
1822 
1823     console_init_postirq();
1824 
1825     system_state = SYS_STATE_smp_boot;
1826 
1827     do_presmp_initcalls();
1828 
1829     alternative_branches();
1830 
1831     /* Defer CR4.CET until alternatives have finished playing with CR0.WP */
1832     if ( cpu_has_xen_shstk )
1833         set_in_cr4(X86_CR4_CET);
1834 
1835     /*
1836      * NB: when running as a PV shim VCPUOP_up/down is wired to the shim
1837      * physical cpu_add/remove functions, so launch the guest with only
1838      * the BSP online and let it bring up the other CPUs as required.
1839      */
1840     if ( !pv_shim )
1841     {
1842         for_each_present_cpu ( i )
1843         {
1844             /* Set up cpu_to_node[]. */
1845             srat_detect_node(i);
1846             /* Set up node_to_cpumask based on cpu_to_node[]. */
1847             numa_add_cpu(i);
1848 
1849             if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
1850                  !cpu_online(i) )
1851             {
1852                 ret = cpu_up(i);
1853                 if ( ret != 0 )
1854                     printk("Failed to bring up CPU %u (error %d)\n", i, ret);
1855                 else if ( num_online_cpus() > max_cpus ||
1856                           (!opt_smt &&
1857                            cpu_data[i].compute_unit_id == INVALID_CUID &&
1858                            cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
1859                 {
1860                     ret = cpu_down(i);
1861                     if ( !ret )
1862                         ++num_parked;
1863                     else
1864                         printk("Could not re-offline CPU%u (%d)\n", i, ret);
1865                 }
1866             }
1867         }
1868     }
1869 
1870     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
1871     if ( num_parked )
1872         printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
1873     smp_cpus_done();
1874 
1875     do_initcalls();
1876 
1877     if ( opt_watchdog )
1878         watchdog_setup();
1879 
1880     if ( !tboot_protect_mem_regions() )
1881         panic("Could not protect TXT memory regions\n");
1882 
1883     init_guest_cpuid();
1884     init_guest_msr_policy();
1885 
1886     if ( xen_cpuidle )
1887         xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
1888 
1889     printk("%sNX (Execute Disable) protection %sactive\n",
1890            cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
1891            cpu_has_nx ? "" : "not ");
1892 
1893     initrdidx = find_first_bit(module_map, mbi->mods_count);
1894     if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
1895         printk(XENLOG_WARNING
1896                "Multiple initrd candidates, picking module #%u\n",
1897                initrdidx);
1898 
1899     /*
1900      * We're going to setup domain0 using the module(s) that we stashed safely
1901      * above our heap. The second module, if present, is an initrd ramdisk.
1902      */
1903     dom0 = create_dom0(mod, modules_headroom,
1904                        initrdidx < mbi->mods_count ? mod + initrdidx : NULL,
1905                        kextra, loader);
1906     if ( !dom0 )
1907         panic("Could not set up DOM0 guest OS\n");
1908 
1909     heap_init_late();
1910 
1911     init_trace_bufs();
1912 
1913     init_constructors();
1914 
1915     console_endboot();
1916 
1917     /* Hide UART from DOM0 if we're using it */
1918     serial_endboot();
1919 
1920     dmi_end_boot();
1921 
1922     setup_io_bitmap(dom0);
1923 
1924     if ( bsp_delay_spec_ctrl )
1925     {
1926         get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow;
1927         barrier();
1928         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
1929     }
1930 
1931     /* Jump to the 1:1 virtual mappings of cpu0_stack. */
1932     asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
1933                   [stk] "g" (__va(__pa(get_stack_bottom()))),
1934                   [fn] "i" (reinit_bsp_stack) : "memory");
1935     unreachable();
1936 }
1937 
void arch_get_xen_caps(xen_capabilities_info_t *info)
{
    /* The interface string is always "xen-3.0-*" for the Xen 3.x ABI. */
    const int major = 3, minor = 0;
    char entry[32];

    /* Start from an empty capability list. */
    (*info)[0] = '\0';

    if ( IS_ENABLED(CONFIG_PV) )
    {
        /* 64-bit PV support is unconditional when PV is compiled in. */
        snprintf(entry, sizeof(entry), "xen-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, entry);

        /* 32-bit compat PV is additionally gated on opt_pv32. */
        if ( opt_pv32 )
        {
            snprintf(entry, sizeof(entry), "xen-%d.%d-x86_32p ", major, minor);
            safe_strcat(*info, entry);
        }
    }
    if ( hvm_enabled )
    {
        /* All three HVM guest flavours are advertised together. */
        snprintf(entry, sizeof(entry), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, entry);
        snprintf(entry, sizeof(entry), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, entry);
        snprintf(entry, sizeof(entry), "hvm-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, entry);
    }
}
1967 
xen_in_range(unsigned long mfn)1968 int __hwdom_init xen_in_range(unsigned long mfn)
1969 {
1970     paddr_t start, end;
1971     int i;
1972 
1973     enum { region_s3, region_ro, region_rw, nr_regions };
1974     static struct {
1975         paddr_t s, e;
1976     } xen_regions[nr_regions] __hwdom_initdata;
1977 
1978     /* initialize first time */
1979     if ( !xen_regions[0].s )
1980     {
1981         /* S3 resume code (and other real mode trampoline code) */
1982         xen_regions[region_s3].s = bootsym_phys(trampoline_start);
1983         xen_regions[region_s3].e = bootsym_phys(trampoline_end);
1984 
1985         /*
1986          * This needs to remain in sync with the uses of the same symbols in
1987          * - __start_xen() (above)
1988          * - is_xen_fixed_mfn()
1989          * - tboot_shutdown()
1990          */
1991 
1992         /* hypervisor .text + .rodata */
1993         xen_regions[region_ro].s = __pa(&_stext);
1994         xen_regions[region_ro].e = __pa(&__2M_rodata_end);
1995         /* hypervisor .data + .bss */
1996         xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
1997         xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
1998     }
1999 
2000     start = (paddr_t)mfn << PAGE_SHIFT;
2001     end = start + PAGE_SIZE;
2002     for ( i = 0; i < nr_regions; i++ )
2003         if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
2004             return 1;
2005 
2006     return 0;
2007 }
2008 
io_bitmap_cb(unsigned long s,unsigned long e,void * ctx)2009 static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
2010                                      void *ctx)
2011 {
2012     struct domain *d = ctx;
2013     unsigned int i;
2014 
2015     ASSERT(e <= INT_MAX);
2016     for ( i = s; i <= e; i++ )
2017         __clear_bit(i, d->arch.hvm.io_bitmap);
2018 
2019     return 0;
2020 }
2021 
setup_io_bitmap(struct domain * d)2022 void __hwdom_init setup_io_bitmap(struct domain *d)
2023 {
2024     int rc;
2025 
2026     if ( is_hvm_domain(d) )
2027     {
2028         bitmap_fill(d->arch.hvm.io_bitmap, 0x10000);
2029         rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
2030                                     io_bitmap_cb, d);
2031         BUG_ON(rc);
2032         /*
2033          * NB: we need to trap accesses to 0xcf8 in order to intercept
2034          * 4 byte accesses, that need to be handled by Xen in order to
2035          * keep consistency.
2036          * Access to 1 byte RTC ports also needs to be trapped in order
2037          * to keep consistency with PV.
2038          */
2039         __set_bit(0xcf8, d->arch.hvm.io_bitmap);
2040         __set_bit(RTC_PORT(0), d->arch.hvm.io_bitmap);
2041         __set_bit(RTC_PORT(1), d->arch.hvm.io_bitmap);
2042     }
2043 }
2044 
2045 /*
2046  * Local variables:
2047  * mode: C
2048  * c-file-style: "BSD"
2049  * c-basic-offset: 4
2050  * tab-width: 4
2051  * indent-tabs-mode: nil
2052  * End:
2053  */
2054