1 /*
2  * x86 SMP booting functions
3  *
4  * This inherits a great deal from Linux's SMP boot code:
5  *  (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6  *  (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <xen/init.h>
23 #include <xen/kernel.h>
24 #include <xen/mm.h>
25 #include <xen/domain.h>
26 #include <xen/domain_page.h>
27 #include <xen/sched.h>
28 #include <xen/irq.h>
29 #include <xen/delay.h>
30 #include <xen/softirq.h>
31 #include <xen/tasklet.h>
32 #include <xen/serial.h>
33 #include <xen/numa.h>
34 #include <xen/cpu.h>
35 #include <asm/cpuidle.h>
36 #include <asm/current.h>
37 #include <asm/mc146818rtc.h>
38 #include <asm/desc.h>
39 #include <asm/div64.h>
40 #include <asm/flushtlb.h>
41 #include <asm/guest.h>
42 #include <asm/microcode.h>
43 #include <asm/msr.h>
44 #include <asm/mtrr.h>
45 #include <asm/spec_ctrl.h>
46 #include <asm/time.h>
47 #include <asm/tboot.h>
48 #include <irq_vectors.h>
49 #include <mach_apic.h>
50 
51 unsigned long __read_mostly trampoline_phys;
52 
53 /* representing HT siblings of each logical CPU */
54 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
55 /* representing HT and core siblings of each logical CPU */
56 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
57 
58 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
59 static cpumask_t scratch_cpu0mask;
60 
61 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, send_ipi_cpumask);
62 static cpumask_t send_ipi_cpu0mask;
63 
64 cpumask_t cpu_online_map __read_mostly;
65 EXPORT_SYMBOL(cpu_online_map);
66 
67 bool __read_mostly park_offline_cpus;
68 
69 unsigned int __read_mostly nr_sockets;
70 cpumask_t **__read_mostly socket_cpumask;
71 static cpumask_t *secondary_socket_cpumask;
72 
73 struct cpuinfo_x86 cpu_data[NR_CPUS];
74 
75 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
76 	{ [0 ... NR_CPUS-1] = BAD_APICID };
77 
78 static int cpu_error;
79 static enum cpu_state {
80     CPU_STATE_DYING,    /* slave -> master: I am dying */
81     CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
82     CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
83     CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
84     CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
85     CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
86 } cpu_state;
87 #define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
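/*
 * Note: the smp_mb() in set_cpu_state() orders all writes made while in the
 * previous state ahead of the state change itself, so a peer observing the
 * new state also sees that data.  Both the BSP and the APs simply poll
 * cpu_state in the code below; there is no interrupt-based notification.
 */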
88 
89 void *stack_base[NR_CPUS];
90 
91 void initialize_cpu_data(unsigned int cpu)
92 {
93     cpu_data[cpu] = boot_cpu_data;
94 }
95 
96 static bool smp_store_cpu_info(unsigned int id)
97 {
98     unsigned int socket;
99 
100     if ( system_state != SYS_STATE_resume )
101         identify_cpu(&cpu_data[id]);
102     else if ( !recheck_cpu_features(id) )
103         return false;
104 
105     socket = cpu_to_socket(id);
106     if ( !socket_cpumask[socket] )
107     {
108         socket_cpumask[socket] = secondary_socket_cpumask;
109         secondary_socket_cpumask = NULL;
110     }
111 
112     return true;
113 }
114 
115 /*
116  * The TSC's upper 32 bits can't be written on earlier CPUs (before
117  * Prescott), so there is no way to resync an AP against the BP.
118  */
119 bool disable_tsc_sync;
120 
121 static atomic_t tsc_count;
122 static uint64_t tsc_value;
123 static cpumask_t tsc_sync_cpu_mask;
124 
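/*
 * TSC sync handshake, as implemented by the two functions below: in each of
 * the five rounds i, the master samples its TSC into tsc_value and advances
 * tsc_count to the odd value 2*i - 1; the slave waits for that, copies
 * tsc_value into its own TSC, and advances tsc_count to the even value 2*i,
 * releasing the master into the next round.  The master finally resets
 * tsc_count to 0, ready for the next AP.
 */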
125 static void synchronize_tsc_master(unsigned int slave)
126 {
127     unsigned int i;
128 
129     if ( disable_tsc_sync )
130         return;
131 
132     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
133          !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
134         return;
135 
136     for ( i = 1; i <= 5; i++ )
137     {
138         tsc_value = rdtsc_ordered();
139         smp_wmb();
140         atomic_inc(&tsc_count);
141         while ( atomic_read(&tsc_count) != (i<<1) )
142             cpu_relax();
143     }
144 
145     atomic_set(&tsc_count, 0);
146     cpumask_clear_cpu(slave, &tsc_sync_cpu_mask);
147 }
148 
149 static void synchronize_tsc_slave(unsigned int slave)
150 {
151     unsigned int i;
152 
153     if ( disable_tsc_sync )
154         return;
155 
156     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
157          !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
158         return;
159 
160     for ( i = 1; i <= 5; i++ )
161     {
162         while ( atomic_read(&tsc_count) != ((i<<1)-1) )
163             cpu_relax();
164         smp_rmb();
165         /*
166          * If a CPU has been physically hotplugged, we may as well write
167          * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
168          * not sync up a new CPU's TSC for us.
169          */
170         __write_tsc(tsc_value);
171         atomic_inc(&tsc_count);
172     }
173 }
174 
175 static void smp_callin(void)
176 {
177     unsigned int cpu = smp_processor_id();
178     int i, rc;
179 
180     /* Wait 2s total for startup. */
181     Dprintk("Waiting for CALLOUT.\n");
182     for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
183     {
184         BUG_ON(i >= 200);
185         cpu_relax();
186         mdelay(10);
187     }
188 
189     /*
190      * The boot CPU has finished the init stage and is spinning on cpu_state
191      * update until we finish. We are free to set up this CPU: first the APIC.
192      */
193     Dprintk("CALLIN, before setup_local_APIC().\n");
194     x2apic_ap_setup();
195     setup_local_APIC(false);
196 
197     /* Save our processor parameters. */
198     if ( !smp_store_cpu_info(cpu) )
199     {
200         printk("CPU%u: Failed to validate features - not coming back online\n",
201                cpu);
202         cpu_error = -ENXIO;
203         goto halt;
204     }
205 
206     if ( cpu_has_hypervisor && (rc = hypervisor_ap_setup()) != 0 )
207     {
208         printk("CPU%d: Failed to initialise hypervisor functions. Not coming online.\n", cpu);
209         cpu_error = rc;
210         goto halt;
211     }
212 
213     if ( (rc = hvm_cpu_up()) != 0 )
214     {
215         printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu);
216         cpu_error = rc;
217     halt:
218         clear_local_APIC();
219         spin_debug_enable();
220         play_dead();
221     }
222 
223     /* Allow the master to continue. */
224     set_cpu_state(CPU_STATE_CALLIN);
225 
226     synchronize_tsc_slave(cpu);
227 
228     /* And wait for our final Ack. */
229     while ( cpu_state != CPU_STATE_ONLINE )
230         cpu_relax();
231 }
232 
233 static int booting_cpu;
234 
235 /* CPUs for which sibling maps can be computed. */
236 static cpumask_t cpu_sibling_setup_map;
237 
238 static void link_thread_siblings(int cpu1, int cpu2)
239 {
240     cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2));
241     cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1));
242     cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2));
243     cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
244 }
245 
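/*
 * Rough shape of the derivation below: two CPUs in the same package are HT
 * siblings if they share a compute unit ID (typically AMD) or a core ID;
 * every CPU in the same package is a core sibling.  booted_cores tracks,
 * per CPU, how many distinct cores of its package currently have at least
 * one thread brought up.
 */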
246 static void set_cpu_sibling_map(unsigned int cpu)
247 {
248     unsigned int i;
249     struct cpuinfo_x86 *c = cpu_data;
250 
251     cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
252 
253     cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
254     cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
255     cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
256 
257     if ( c[cpu].x86_num_siblings > 1 )
258     {
259         for_each_cpu ( i, &cpu_sibling_setup_map )
260         {
261             if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
262                 continue;
263             if ( c[cpu].compute_unit_id != INVALID_CUID &&
264                  c[i].compute_unit_id != INVALID_CUID )
265             {
266                 if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
267                     link_thread_siblings(cpu, i);
268             }
269             else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
270                       c[i].cpu_core_id != XEN_INVALID_CORE_ID )
271             {
272                 if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
273                     link_thread_siblings(cpu, i);
274             }
275             else
276                 printk(XENLOG_WARNING
277                        "CPU%u: unclear relationship with CPU%u\n",
278                        cpu, i);
279         }
280     }
281 
282     if ( c[cpu].x86_max_cores == 1 )
283     {
284         cpumask_copy(per_cpu(cpu_core_mask, cpu),
285                      per_cpu(cpu_sibling_mask, cpu));
286         c[cpu].booted_cores = 1;
287         return;
288     }
289 
290     for_each_cpu ( i, &cpu_sibling_setup_map )
291     {
292         if ( c[cpu].phys_proc_id == c[i].phys_proc_id )
293         {
294             cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu));
295             cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i));
296             /*
297              * Does this new cpu bring up a new core?
298              */
299             if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
300             {
301                 /*
302                  * for each core in package, increment
303                  * the booted_cores for this new cpu
304                  */
305                 if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i )
306                     c[cpu].booted_cores++;
307                 /*
308                  * increment the core count for all
309                  * the other cpus in this package
310                  */
311                 if ( i != cpu )
312                     c[i].booted_cores++;
313             }
314             else if ( (i != cpu) && !c[cpu].booted_cores )
315             {
316                 c[cpu].booted_cores = c[i].booted_cores;
317             }
318         }
319     }
320 }
321 
322 void start_secondary(void *unused)
323 {
324     /*
325      * Don't put anything before smp_callin(); SMP booting is so fragile that we
326      * want to limit the work done here to the bare minimum.
327      */
328     unsigned int cpu = booting_cpu;
329 
330     /* Critical region without IDT or TSS.  Any fault is deadly! */
331 
332     set_current(idle_vcpu[cpu]);
333     this_cpu(curr_vcpu) = idle_vcpu[cpu];
334     rdmsrl(MSR_EFER, this_cpu(efer));
335     init_shadow_spec_ctrl_state();
336 
337     /*
338      * Just as during early bootstrap, it is convenient here to disable
339      * spinlock checking while we have IRQs disabled. This allows us to
340      * acquire IRQ-unsafe locks when it would otherwise be disallowed.
341      *
342      * It is safe because the race we are usually trying to avoid involves
343      * a group of CPUs rendezvousing in an IPI handler, where one cannot
344      * join because it is spinning with IRQs disabled waiting to acquire a
345      * lock held by another in the rendezvous group (the lock must be an
346      * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and
347      * hence had IRQs enabled). This is a deadlock scenario.
348      *
349      * However, no CPU can be involved in rendezvous until it is online,
350      * hence no such group can be waiting for this CPU until it is
351      * visible in cpu_online_map. Hence such a deadlock is not possible.
352      */
353     spin_debug_disable();
354 
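    /*
     * These fields feed the XPTI entry/exit paths.  Leaving them zero is
     * taken here to mean "no page table switch required yet"; the per-CPU
     * shadow root (per_cpu(root_pgt)) only comes into play once guest
     * context is loaded later on.  (This reading of the zero values is an
     * interpretation; the entry code is the authoritative reference.)
     */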
355     get_cpu_info()->use_pv_cr3 = false;
356     get_cpu_info()->xen_cr3 = 0;
357     get_cpu_info()->pv_cr3 = 0;
358 
359     load_system_tables();
360 
361     /* Full exception support from here on in. */
362 
363     /* Safe to enable feature such as CR4.MCE with the IDT set up now. */
364     write_cr4(mmu_cr4_features);
365 
366     percpu_traps_init();
367 
368     cpu_init();
369 
370     initialize_cpu_data(cpu);
371 
372     microcode_update_one();
373 
374     /*
375      * If any speculative control MSRs are available, apply Xen's default
376      * settings.  Note: These MSRs may only become available after loading
377      * microcode.
378      */
379     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
380         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
381     if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
382         wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
383 
384     tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
385 
386     smp_callin();
387 
388     set_cpu_sibling_map(cpu);
389 
390     init_percpu_time();
391 
392     setup_secondary_APIC_clock();
393 
394     /*
395      * Low-memory mappings have been cleared; flush them from
396      * the local TLBs too.
397      */
398     flush_tlb_local();
399 
400     /* This must be done before setting cpu_online_map */
401     spin_debug_enable();
402     notify_cpu_starting(cpu);
403 
404     /*
405      * We need to hold vector_lock so that the set of online cpus
406      * does not change while we are assigning vectors to cpus.  Holding
407      * this lock ensures we don't half-assign or half-remove an irq from a cpu.
408      */
409     lock_vector_lock();
410     setup_vector_irq(cpu);
411     cpumask_set_cpu(cpu, &cpu_online_map);
412     unlock_vector_lock();
413 
414     /* We can take interrupts now: we're officially "up". */
415     local_irq_enable();
416     mtrr_ap_init();
417 
418     startup_cpu_idle_loop();
419 }
420 
421 extern void *stack_start;
422 
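/*
 * wakeup_secondary_cpu() implements the classic INIT-SIPI-SIPI sequence:
 * assert (and, in xAPIC mode, deassert) INIT to reset the target, then send
 * up to two STARTUP IPIs whose vector encodes the real-mode entry point
 * (start_eip >> 12).  The ICR busy polling and the extra delays only apply
 * in xAPIC mode; the x2APIC ICR has no delivery-status bit to wait on.
 */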
423 static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
424 {
425     unsigned long send_status = 0, accept_status = 0;
426     int maxlvt, timeout, i;
427 
428     /*
429      * Be paranoid about clearing APIC errors.
430      */
431     apic_write(APIC_ESR, 0);
432     apic_read(APIC_ESR);
433 
434     Dprintk("Asserting INIT.\n");
435 
436     /*
437      * Turn INIT on target chip via IPI
438      */
439     apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
440                    phys_apicid);
441 
442     if ( !x2apic_enabled )
443     {
444         Dprintk("Waiting for send to finish...\n");
445         timeout = 0;
446         do {
447             Dprintk("+");
448             udelay(100);
449             send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
450         } while ( send_status && (timeout++ < 1000) );
451 
452         mdelay(10);
453 
454         Dprintk("Deasserting INIT.\n");
455 
456         apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
457 
458         Dprintk("Waiting for send to finish...\n");
459         timeout = 0;
460         do {
461             Dprintk("+");
462             udelay(100);
463             send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
464         } while ( send_status && (timeout++ < 1000) );
465     }
466     else if ( tboot_in_measured_env() )
467     {
468         /*
469          * With tboot the AP is actually spinning in a mini-guest before
470          * receiving INIT. Upon receiving the INIT IPI, the AP needs time to
471          * VMExit, update the VMCS to track SIPIs, and VMResume.
472          *
473          * While the AP is in root mode handling the INIT, the CPU will drop
474          * any SIPIs.
475          */
476         udelay(10);
477     }
478 
479     maxlvt = get_maxlvt();
480 
481     for ( i = 0; i < 2; i++ )
482     {
483         Dprintk("Sending STARTUP #%d.\n", i+1);
484         apic_write(APIC_ESR, 0);
485         apic_read(APIC_ESR);
486         Dprintk("After apic_write.\n");
487 
488         /*
489          * STARTUP IPI
490          * Boot on the stack
491          */
492         apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
493 
494         if ( !x2apic_enabled )
495         {
496             /* Give the other CPU some time to accept the IPI. */
497             udelay(300);
498 
499             Dprintk("Startup point 1.\n");
500 
501             Dprintk("Waiting for send to finish...\n");
502             timeout = 0;
503             do {
504                 Dprintk("+");
505                 udelay(100);
506                 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
507             } while ( send_status && (timeout++ < 1000) );
508 
509             /* Give the other CPU some time to accept the IPI. */
510             udelay(200);
511         }
512 
513         /* Due to the Pentium erratum 3AP. */
514         if ( maxlvt > 3 )
515         {
516             apic_write(APIC_ESR, 0);
517         }
518         accept_status = (apic_read(APIC_ESR) & 0xEF);
519         if ( send_status || accept_status )
520             break;
521     }
522     Dprintk("After Startup.\n");
523 
524     if ( send_status )
525         printk("APIC never delivered???\n");
526     if ( accept_status )
527         printk("APIC delivery error (%lx).\n", accept_status);
528 
529     return (send_status | accept_status);
530 }
531 
532 int alloc_cpu_id(void)
533 {
534     cpumask_t tmp_map;
535     int cpu;
536 
537     cpumask_complement(&tmp_map, &cpu_present_map);
538     cpu = cpumask_first(&tmp_map);
539     return (cpu < nr_cpu_ids) ? cpu : -ENODEV;
540 }
541 
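/*
 * Boot a single AP: point the trampoline at this CPU's stack, kick the
 * target with INIT/SIPI (after letting tboot wake it in a measured
 * environment), then drive the CPU_STATE_* handshake, waiting up to 5s for
 * the AP to reach CALLIN before synchronising its TSC.  Failures are
 * reported via cpu_error or as -EIO.
 */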
542 static int do_boot_cpu(int apicid, int cpu)
543 {
544     int timeout, boot_error = 0, rc = 0;
545     unsigned long start_eip;
546 
547     /*
548      * Save current MTRR state in case it was changed since early boot
549      * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
550      */
551     mtrr_save_state();
552 
553     booting_cpu = cpu;
554 
555     start_eip = bootsym_phys(trampoline_realmode_entry);
556 
557     /* start_eip needs to be page aligned, and below the 1M boundary. */
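    /*
     * (The STARTUP IPI vector passed to wakeup_secondary_cpu() is
     * start_eip >> 12 and only 8 bits wide, so the trampoline must be
     * 4KiB-aligned and below 1MiB; that is what the check below enforces.)
     */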
558     if ( start_eip & ~0xff000 )
559         panic("AP trampoline %#lx not suitably positioned\n", start_eip);
560 
561     /* So we see what's up   */
562     if ( opt_cpu_info )
563         printk("Booting processor %d/%d eip %lx\n",
564                cpu, apicid, start_eip);
565 
566     stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
567 
568     /* This grunge runs the startup process for the targeted processor. */
569 
570     set_cpu_state(CPU_STATE_INIT);
571 
572     /* Starting actual IPI sequence... */
573     if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) )
574         boot_error = wakeup_secondary_cpu(apicid, start_eip);
575 
576     if ( !boot_error )
577     {
578         /* Allow AP to start initializing. */
579         set_cpu_state(CPU_STATE_CALLOUT);
580         Dprintk("After Callout %d.\n", cpu);
581 
582         /* Wait 5s total for a response. */
583         for ( timeout = 0; timeout < 50000; timeout++ )
584         {
585             if ( cpu_state != CPU_STATE_CALLOUT )
586                 break;
587             udelay(100);
588         }
589 
590         if ( cpu_state == CPU_STATE_CALLIN )
591         {
592             /* number CPUs logically, starting from 1 (BSP is 0) */
593             Dprintk("OK.\n");
594             print_cpu_info(cpu);
595             synchronize_tsc_master(cpu);
596             Dprintk("CPU has booted.\n");
597         }
598         else if ( cpu_state == CPU_STATE_DEAD )
599         {
600             smp_rmb();
601             rc = cpu_error;
602         }
603         else
604         {
605             boot_error = 1;
606             smp_mb();
607             if ( bootsym(trampoline_cpu_started) == 0xA5 )
608                 /* trampoline started but...? */
609                 printk("Stuck ??\n");
610             else
611                 /* trampoline code not run */
612                 printk("Not responding.\n");
613         }
614     }
615 
616     if ( boot_error )
617     {
618         cpu_exit_clear(cpu);
619         rc = -EIO;
620     }
621 
622     /* mark "stuck" area as not stuck */
623     bootsym(trampoline_cpu_started) = 0;
624     smp_mb();
625 
626     return rc;
627 }
628 
629 #define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
630 
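/*
 * Stub pages are shared: up to STUBS_PER_PAGE CPUs use one page, each
 * owning a STUB_BUF_SIZE slot at STUB_BUF_CPU_OFFS(cpu).  Callers pass
 * *mfn == 0 to allocate (and 0xcc-poison) a fresh page, or a non-zero MFN
 * to map an already existing page at this CPU's stub virtual address.
 */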
631 unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
632 {
633     unsigned long stub_va;
634     struct page_info *pg;
635 
636     BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
637 
638     if ( *mfn )
639         pg = mfn_to_page(_mfn(*mfn));
640     else
641     {
642         nodeid_t node = cpu_to_node(cpu);
643         unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
644 
645         pg = alloc_domheap_page(NULL, memflags);
646         if ( !pg )
647             return 0;
648 
649         unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
650     }
651 
652     stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
653     if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
654                           PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
655     {
656         if ( !*mfn )
657             free_domheap_page(pg);
658         stub_va = 0;
659     }
660     else if ( !*mfn )
661         *mfn = mfn_x(page_to_mfn(pg));
662 
663     return stub_va;
664 }
665 
666 void cpu_exit_clear(unsigned int cpu)
667 {
668     cpu_uninit(cpu);
669     set_cpu_state(CPU_STATE_DEAD);
670 }
671 
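/*
 * Copy the mapping of 'ptr' from the idle page tables into the per-CPU
 * root table 'rpt', allocating intermediate page table levels as needed.
 * Superpage mappings are narrowed to the single 4KiB frame covering 'ptr',
 * and _PAGE_GLOBAL is dropped so the cloned entry does not survive CR3
 * switches.
 */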
672 static int clone_mapping(const void *ptr, root_pgentry_t *rpt)
673 {
674     unsigned long linear = (unsigned long)ptr, pfn;
675     unsigned int flags;
676     l3_pgentry_t *pl3e;
677     l2_pgentry_t *pl2e;
678     l1_pgentry_t *pl1e;
679 
680     /*
681      * Sanity check 'linear'.  We only allow cloning from the Xen virtual
682      * range, and in particular, only from the directmap and .text ranges.
683      */
684     if ( root_table_offset(linear) > ROOT_PAGETABLE_LAST_XEN_SLOT ||
685          root_table_offset(linear) < ROOT_PAGETABLE_FIRST_XEN_SLOT )
686         return -EINVAL;
687 
688     if ( linear < XEN_VIRT_START ||
689          (linear >= XEN_VIRT_END && linear < DIRECTMAP_VIRT_START) )
690         return -EINVAL;
691 
692     pl3e = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]) +
693         l3_table_offset(linear);
694 
695     flags = l3e_get_flags(*pl3e);
696     ASSERT(flags & _PAGE_PRESENT);
697     if ( flags & _PAGE_PSE )
698     {
699         pfn = (l3e_get_pfn(*pl3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
700               (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
701         flags &= ~_PAGE_PSE;
702     }
703     else
704     {
705         pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(linear);
706         flags = l2e_get_flags(*pl2e);
707         ASSERT(flags & _PAGE_PRESENT);
708         if ( flags & _PAGE_PSE )
709         {
710             pfn = (l2e_get_pfn(*pl2e) & ~((1UL << PAGETABLE_ORDER) - 1)) |
711                   (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
712             flags &= ~_PAGE_PSE;
713         }
714         else
715         {
716             pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(linear);
717             flags = l1e_get_flags(*pl1e);
718             if ( !(flags & _PAGE_PRESENT) )
719                 return 0;
720             pfn = l1e_get_pfn(*pl1e);
721         }
722     }
723 
724     if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
725     {
726         pl3e = alloc_xen_pagetable();
727         if ( !pl3e )
728             return -ENOMEM;
729         clear_page(pl3e);
730         l4e_write(&rpt[root_table_offset(linear)],
731                   l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
732     }
733     else
734         pl3e = l4e_to_l3e(rpt[root_table_offset(linear)]);
735 
736     pl3e += l3_table_offset(linear);
737 
738     if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
739     {
740         pl2e = alloc_xen_pagetable();
741         if ( !pl2e )
742             return -ENOMEM;
743         clear_page(pl2e);
744         l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
745     }
746     else
747     {
748         ASSERT(!(l3e_get_flags(*pl3e) & _PAGE_PSE));
749         pl2e = l3e_to_l2e(*pl3e);
750     }
751 
752     pl2e += l2_table_offset(linear);
753 
754     if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
755     {
756         pl1e = alloc_xen_pagetable();
757         if ( !pl1e )
758             return -ENOMEM;
759         clear_page(pl1e);
760         l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
761     }
762     else
763     {
764         ASSERT(!(l2e_get_flags(*pl2e) & _PAGE_PSE));
765         pl1e = l2e_to_l1e(*pl2e);
766     }
767 
768     pl1e += l1_table_offset(linear);
769     flags &= ~_PAGE_GLOBAL;
770 
771     if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT )
772     {
773         ASSERT(l1e_get_pfn(*pl1e) == pfn);
774         ASSERT(l1e_get_flags(*pl1e) == flags);
775     }
776     else
777         l1e_write(pl1e, l1e_from_pfn(pfn, flags));
778 
779     return 0;
780 }
781 
782 DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
783 
784 static root_pgentry_t common_pgt;
785 
786 extern const char _stextentry[], _etextentry[];
787 
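/*
 * Build the per-CPU root page table used by XPTI.  It contains the
 * read-only M2P slot, the shared .text.entry/stub L4 slot (common_pgt),
 * and clones of this CPU's stack, IDT, TSS and stub mappings.  If XPTI is
 * disabled for both hwdom and domU guests, no table is allocated at all.
 */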
788 static int setup_cpu_root_pgt(unsigned int cpu)
789 {
790     root_pgentry_t *rpt;
791     unsigned int off;
792     int rc;
793 
794     if ( !opt_xpti_hwdom && !opt_xpti_domu )
795         return 0;
796 
797     rpt = alloc_xen_pagetable();
798     if ( !rpt )
799         return -ENOMEM;
800 
801     clear_page(rpt);
802     per_cpu(root_pgt, cpu) = rpt;
803 
804     rpt[root_table_offset(RO_MPT_VIRT_START)] =
805         idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
806     /* SH_LINEAR_PT inserted together with guest mappings. */
807     /* PERDOMAIN inserted during context switch. */
808 
809     /* One-time setup of common_pgt, which maps .text.entry and the stubs. */
810     if ( unlikely(!root_get_intpte(common_pgt)) )
811     {
812         const char *ptr;
813 
814         for ( rc = 0, ptr = _stextentry;
815               !rc && ptr < _etextentry; ptr += PAGE_SIZE )
816             rc = clone_mapping(ptr, rpt);
817 
818         if ( rc )
819             return rc;
820 
821         common_pgt = rpt[root_table_offset(XEN_VIRT_START)];
822     }
823 
824     rpt[root_table_offset(XEN_VIRT_START)] = common_pgt;
825 
826     /* Install direct map page table entries for stack, IDT, and TSS. */
827     for ( off = rc = 0; !rc && off < STACK_SIZE; off += PAGE_SIZE )
828         rc = clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt);
829 
830     if ( !rc )
831         rc = clone_mapping(idt_tables[cpu], rpt);
832     if ( !rc )
833     {
834         struct tss_page *ptr = &per_cpu(tss_page, cpu);
835 
836         BUILD_BUG_ON(sizeof(*ptr) != PAGE_SIZE);
837 
838         rc = clone_mapping(&ptr->tss, rpt);
839     }
840     if ( !rc )
841         rc = clone_mapping((void *)per_cpu(stubs.addr, cpu), rpt);
842 
843     return rc;
844 }
845 
846 static void cleanup_cpu_root_pgt(unsigned int cpu)
847 {
848     root_pgentry_t *rpt = per_cpu(root_pgt, cpu);
849     unsigned int r;
850     unsigned long stub_linear = per_cpu(stubs.addr, cpu);
851 
852     if ( !rpt )
853         return;
854 
855     per_cpu(root_pgt, cpu) = NULL;
856 
857     for ( r = root_table_offset(DIRECTMAP_VIRT_START);
858           r < root_table_offset(HYPERVISOR_VIRT_END); ++r )
859     {
860         l3_pgentry_t *l3t;
861         mfn_t l3mfn;
862         unsigned int i3;
863 
864         if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) )
865             continue;
866 
867         l3mfn = l4e_get_mfn(rpt[r]);
868         l3t = map_domain_page(l3mfn);
869 
870         for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 )
871         {
872             l2_pgentry_t *l2t;
873             mfn_t l2mfn;
874             unsigned int i2;
875 
876             if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) )
877                 continue;
878 
879             ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE));
880             l2mfn = l3e_get_mfn(l3t[i3]);
881             l2t = map_domain_page(l2mfn);
882 
883             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 )
884             {
885                 if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) )
886                     continue;
887 
888                 ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE));
889                 free_xen_pagetable_new(l2e_get_mfn(l2t[i2]));
890             }
891 
892             unmap_domain_page(l2t);
893             free_xen_pagetable_new(l2mfn);
894         }
895 
896         unmap_domain_page(l3t);
897         free_xen_pagetable_new(l3mfn);
898     }
899 
900     free_xen_pagetable(rpt);
901 
902     /* Also zap the stub mapping for this CPU. */
903     if ( stub_linear )
904     {
905         l3_pgentry_t l3e = l3e_from_l4e(common_pgt,
906                                         l3_table_offset(stub_linear));
907         l2_pgentry_t l2e = l2e_from_l3e(l3e, l2_table_offset(stub_linear));
908         l1_pgentry_t *l1t = map_l1t_from_l2e(l2e);
909 
910         l1t[l1_table_offset(stub_linear)] = l1e_empty();
911 
912         unmap_domain_page(l1t);
913     }
914 }
915 
916 /*
917  * The 'remove' boolean controls whether a CPU is just getting offlined (and
918  * parked), or outright removed / offlined without parking. Parked CPUs need
919  * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
920  * A few other items, in particular CPU masks, are also retained, as it's
921  * difficult to prove that they're entirely unreferenced from parked CPUs.
922  */
923 static void cpu_smpboot_free(unsigned int cpu, bool remove)
924 {
925     unsigned int socket = cpu_to_socket(cpu);
926     struct cpuinfo_x86 *c = cpu_data;
927 
928     if ( cpumask_empty(socket_cpumask[socket]) )
929     {
930         xfree(socket_cpumask[socket]);
931         socket_cpumask[socket] = NULL;
932     }
933 
934     cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
935 
936     if ( remove )
937     {
938         c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
939         c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
940         c[cpu].compute_unit_id = INVALID_CUID;
941 
942         FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
943         FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
944         if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
945             FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
946         if ( per_cpu(send_ipi_cpumask, cpu) != &send_ipi_cpu0mask )
947             FREE_CPUMASK_VAR(per_cpu(send_ipi_cpumask, cpu));
948     }
949 
950     cleanup_cpu_root_pgt(cpu);
951 
952     if ( per_cpu(stubs.addr, cpu) )
953     {
954         mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
955         unsigned char *stub_page = map_domain_page(mfn);
956         unsigned int i;
957 
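        /*
         * Poison this CPU's slot in the (possibly shared) stub page, then
         * scan the other slots: only if every slot reads back as 0xcc
         * (i.e. i reaches STUBS_PER_PAGE) is the page unused and freed.
         */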
958         memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
959         for ( i = 0; i < STUBS_PER_PAGE; ++i )
960             if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
961                 break;
962         unmap_domain_page(stub_page);
963         destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
964                              (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
965         per_cpu(stubs.addr, cpu) = 0;
966         per_cpu(stubs.mfn, cpu) = 0;
967         if ( i == STUBS_PER_PAGE )
968             free_domheap_page(mfn_to_page(mfn));
969     }
970 
971     if ( IS_ENABLED(CONFIG_PV32) )
972         FREE_XENHEAP_PAGE(per_cpu(compat_gdt, cpu));
973 
974     if ( remove )
975     {
976         FREE_XENHEAP_PAGE(per_cpu(gdt, cpu));
977         FREE_XENHEAP_PAGE(idt_tables[cpu]);
978 
979         if ( stack_base[cpu] )
980         {
981             memguard_unguard_stack(stack_base[cpu]);
982             FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
983         }
984     }
985 }
986 
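/*
 * Allocate everything a new CPU needs before it can be started: stack,
 * GDT (and compat GDT for PV32), IDT, a stub slot, the XPTI root page
 * table and the various per-CPU cpumasks, preferring memory from the
 * CPU's own NUMA node.  On any failure the partial allocations are torn
 * down again via cpu_smpboot_free(cpu, true).
 */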
987 static int cpu_smpboot_alloc(unsigned int cpu)
988 {
989     struct cpu_info *info;
990     unsigned int i, memflags = 0;
991     nodeid_t node = cpu_to_node(cpu);
992     seg_desc_t *gdt;
993     unsigned long stub_page;
994     int rc = -ENOMEM;
995 
996     if ( node != NUMA_NO_NODE )
997         memflags = MEMF_node(node);
998 
999     if ( stack_base[cpu] == NULL )
1000     {
1001         stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
1002         if ( !stack_base[cpu] )
1003             goto out;
1004 
1005         memguard_guard_stack(stack_base[cpu]);
1006     }
1007 
1008     info = get_cpu_info_from_stack((unsigned long)stack_base[cpu]);
1009     info->processor_id = cpu;
1010     info->per_cpu_offset = __per_cpu_offset[cpu];
1011 
1012     gdt = per_cpu(gdt, cpu) ?: alloc_xenheap_pages(0, memflags);
1013     if ( gdt == NULL )
1014         goto out;
1015     per_cpu(gdt, cpu) = gdt;
1016     per_cpu(gdt_l1e, cpu) =
1017         l1e_from_pfn(virt_to_mfn(gdt), __PAGE_HYPERVISOR_RW);
1018     memcpy(gdt, boot_gdt, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
1019     BUILD_BUG_ON(NR_CPUS > 0x10000);
1020     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
1021 
1022 #ifdef CONFIG_PV32
1023     per_cpu(compat_gdt, cpu) = gdt = alloc_xenheap_pages(0, memflags);
1024     if ( gdt == NULL )
1025         goto out;
1026     per_cpu(compat_gdt_l1e, cpu) =
1027         l1e_from_pfn(virt_to_mfn(gdt), __PAGE_HYPERVISOR_RW);
1028     memcpy(gdt, boot_compat_gdt, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
1029     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
1030 #endif
1031 
1032     if ( idt_tables[cpu] == NULL )
1033         idt_tables[cpu] = alloc_xenheap_pages(0, memflags);
1034     if ( idt_tables[cpu] == NULL )
1035         goto out;
1036     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
1037     disable_each_ist(idt_tables[cpu]);
1038 
1039     for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
1040           i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
1041         if ( cpu_online(i) && cpu_to_node(i) == node )
1042         {
1043             per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
1044             break;
1045         }
1046     BUG_ON(i == cpu);
1047     stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
1048     if ( !stub_page )
1049         goto out;
1050     per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
1051 
1052     rc = setup_cpu_root_pgt(cpu);
1053     if ( rc )
1054         goto out;
1055     rc = -ENOMEM;
1056 
1057     if ( secondary_socket_cpumask == NULL &&
1058          (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
1059         goto out;
1060 
1061     if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
1062            cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
1063            cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) &&
1064            cond_alloc_cpumask_var(&per_cpu(send_ipi_cpumask, cpu))) )
1065         goto out;
1066 
1067     rc = 0;
1068 
1069  out:
1070     if ( rc )
1071         cpu_smpboot_free(cpu, true);
1072 
1073     return rc;
1074 }
1075 
1076 static int cpu_smpboot_callback(
1077     struct notifier_block *nfb, unsigned long action, void *hcpu)
1078 {
1079     unsigned int cpu = (unsigned long)hcpu;
1080     int rc = 0;
1081 
1082     switch ( action )
1083     {
1084     case CPU_UP_PREPARE:
1085         rc = cpu_smpboot_alloc(cpu);
1086         break;
1087     case CPU_UP_CANCELED:
1088     case CPU_DEAD:
1089         cpu_smpboot_free(cpu, !park_offline_cpus);
1090         break;
1091     case CPU_REMOVE:
1092         cpu_smpboot_free(cpu, true);
1093         break;
1094     }
1095 
1096     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1097 }
1098 
1099 static struct notifier_block cpu_smpboot_nfb = {
1100     .notifier_call = cpu_smpboot_callback
1101 };
1102 
1103 void __init smp_prepare_cpus(void)
1104 {
1105     int rc;
1106 
1107     register_cpu_notifier(&cpu_smpboot_nfb);
1108 
1109     mtrr_aps_sync_begin();
1110 
1111     /* Setup boot CPU information */
1112     initialize_cpu_data(0); /* Final full version of the data */
1113     print_cpu_info(0);
1114 
1115     boot_cpu_physical_apicid = get_apic_id();
1116     x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
1117 
1118     stack_base[0] = (void *)((unsigned long)stack_start & ~(STACK_SIZE - 1));
1119 
1120     rc = setup_cpu_root_pgt(0);
1121     if ( rc )
1122         panic("Error %d setting up PV root page table\n", rc);
1123     if ( per_cpu(root_pgt, 0) )
1124     {
1125         get_cpu_info()->pv_cr3 = 0;
1126 
1127 #ifdef CONFIG_PV
1128         /*
1129          * All entry points which may need to switch page tables have to start
1130          * with interrupts off. Re-write what pv_trap_init() has put there.
1131          */
1132         _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
1133                   &int80_direct_trap);
1134 #endif
1135     }
1136 
1137     set_nr_sockets();
1138 
1139     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
1140     if ( socket_cpumask == NULL ||
1141          (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
1142         panic("No memory for socket CPU siblings map\n");
1143 
1144     if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
1145          !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
1146         panic("No memory for boot CPU sibling/core maps\n");
1147 
1148     set_cpu_sibling_map(0);
1149 
1150     /*
1151      * If we couldn't find an SMP configuration at boot time,
1152      * get out of here now!
1153      */
1154     if ( !smp_found_config && !acpi_lapic )
1155     {
1156         printk(KERN_NOTICE "SMP motherboard not detected.\n");
1157     init_uniprocessor:
1158         physids_clear(phys_cpu_present_map);
1159         physid_set(0, phys_cpu_present_map);
1160         if (APIC_init_uniprocessor())
1161             printk(KERN_NOTICE "Local APIC not detected."
1162                    " Using dummy APIC emulation.\n");
1163         return;
1164     }
1165 
1166     /*
1167      * Should not be necessary because the MP table should list the boot
1168      * CPU too, but we do it for the sake of robustness anyway.
1169      * It makes no sense to do this check in clustered APIC mode, so skip it.
1170      */
1171     if ( !check_apicid_present(boot_cpu_physical_apicid) )
1172     {
1173         printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1174                boot_cpu_physical_apicid);
1175         physid_set(get_apic_id(), phys_cpu_present_map);
1176     }
1177 
1178     /* If we couldn't find a local APIC, then get out of here now! */
1179     if ( !cpu_has_apic )
1180     {
1181         printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1182                boot_cpu_physical_apicid);
1183         goto init_uniprocessor;
1184     }
1185 
1186     verify_local_APIC();
1187 
1188     connect_bsp_APIC();
1189     setup_local_APIC(true);
1190 
1191     if ( !skip_ioapic_setup && nr_ioapics )
1192         setup_IO_APIC();
1193 
1194     setup_boot_APIC_clock();
1195 }
1196 
1197 void __init smp_prepare_boot_cpu(void)
1198 {
1199     unsigned int cpu = smp_processor_id();
1200 
1201     cpumask_set_cpu(cpu, &cpu_online_map);
1202     cpumask_set_cpu(cpu, &cpu_present_map);
1203 #if NR_CPUS > 2 * BITS_PER_LONG
1204     per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
1205     per_cpu(send_ipi_cpumask, cpu) = &send_ipi_cpu0mask;
1206 #endif
1207 
1208     get_cpu_info()->use_pv_cr3 = false;
1209     get_cpu_info()->xen_cr3 = 0;
1210     get_cpu_info()->pv_cr3 = 0;
1211 }
1212 
1213 static void
1214 remove_siblinginfo(int cpu)
1215 {
1216     int sibling;
1217 
1218     cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
1219 
1220     for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
1221     {
1222         cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
1223         /* Last thread sibling in this cpu core going down. */
1224         if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
1225             cpu_data[sibling].booted_cores--;
1226     }
1227 
1228     for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu))
1229         cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
1230     cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
1231     cpumask_clear(per_cpu(cpu_core_mask, cpu));
1232 }
1233 
1234 void __cpu_disable(void)
1235 {
1236     int cpu = smp_processor_id();
1237 
1238     set_cpu_state(CPU_STATE_DYING);
1239 
1240     local_irq_disable();
1241     clear_local_APIC();
1242     /* Allow any queued timer interrupts to get serviced */
1243     local_irq_enable();
1244     mdelay(1);
1245     local_irq_disable();
1246 
1247     time_suspend();
1248 
1249     remove_siblinginfo(cpu);
1250 
1251     /* It's now safe to remove this processor from the online map */
1252     cpumask_clear_cpu(cpu, &cpu_online_map);
1253     fixup_irqs(&cpu_online_map, 1);
1254     fixup_eoi();
1255 }
1256 
1257 void __cpu_die(unsigned int cpu)
1258 {
1259     /* We don't do anything here: idle task is faking death itself. */
1260     unsigned int i = 0;
1261     enum cpu_state seen_state;
1262 
1263     while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
1264     {
1265         BUG_ON(seen_state != CPU_STATE_DYING);
1266         mdelay(100);
1267         cpu_relax();
1268         process_pending_softirqs();
1269         if ( (++i % 10) == 0 )
1270             printk(KERN_ERR "CPU %u still not dead...\n", cpu);
1271     }
1272 }
1273 
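/*
 * Hot-add a CPU described by ACPI: register its local APIC, record the
 * ACPI ID <-> APIC ID mapping, derive its NUMA node from the proximity
 * domain, and mark it for explicit TSC sync at bring-up, overriding
 * X86_FEATURE_TSC_RELIABLE, since the platform does not synchronise the
 * TSC of a physically added socket.
 */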
1274 int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
1275 {
1276     int cpu = -1;
1277 
1278     dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
1279             apic_id, acpi_id, pxm);
1280 
1281     if ( (acpi_id >= MAX_MADT_ENTRIES) ||
1282          (apic_id >= MAX_APICS) ||
1283          (pxm >= 256) )
1284         return -EINVAL;
1285 
1286     cpu_hotplug_begin();
1287 
1288     /* Detect if the cpu has been added before */
1289     if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID )
1290     {
1291         cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id)
1292             ? -EINVAL : -EEXIST;
1293         goto out;
1294     }
1295 
1296     if ( physid_isset(apic_id, phys_cpu_present_map) )
1297     {
1298         cpu = -EEXIST;
1299         goto out;
1300     }
1301 
1302     if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 )
1303         goto out;
1304 
1305     x86_acpiid_to_apicid[acpi_id] = apic_id;
1306 
1307     if ( !srat_disabled() )
1308     {
1309         nodeid_t node = setup_node(pxm);
1310 
1311         if ( node == NUMA_NO_NODE )
1312         {
1313             dprintk(XENLOG_WARNING,
1314                     "Setup node failed for pxm %x\n", pxm);
1315             x86_acpiid_to_apicid[acpi_id] = BAD_APICID;
1316             mp_unregister_lapic(apic_id, cpu);
1317             cpu = -ENOSPC;
1318             goto out;
1319         }
1320         if ( apic_id < MAX_LOCAL_APIC )
1321              apicid_to_node[apic_id] = node;
1322     }
1323 
1324     /* Physically added CPUs do not have synchronised TSC. */
1325     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1326     {
1327         printk_once(
1328             XENLOG_WARNING
1329             "New CPU %u may have skewed TSC and break cross-CPU TSC coherency\n"
1330             "Consider using \"tsc=skewed\" to force emulation where appropriate\n",
1331             cpu);
1332         cpumask_set_cpu(cpu, &tsc_sync_cpu_mask);
1333     }
1334 
1335     srat_detect_node(cpu);
1336     numa_add_cpu(cpu);
1337     dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
1338  out:
1339     cpu_hotplug_done();
1340     return cpu;
1341 }
1342 
1343 
1344 int __cpu_up(unsigned int cpu)
1345 {
1346     int apicid, ret;
1347 
1348     if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
1349         return -ENODEV;
1350 
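    /*
     * xAPIC destination IDs are only 8 bits wide, so an APIC ID at or above
     * APIC_ALL_CPUS (presumably the 0xff broadcast ID) needs x2APIC, and an
     * APIC ID above 255 additionally requires full interrupt remapping.
     */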
1351     if ( (!x2apic_enabled && apicid >= APIC_ALL_CPUS) ||
1352          (iommu_intremap != iommu_intremap_full && (apicid >> 8)) )
1353     {
1354         printk("Unsupported: APIC ID %#x in xAPIC mode w/o interrupt remapping\n",
1355                apicid);
1356         return -EINVAL;
1357     }
1358 
1359     if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
1360         return ret;
1361 
1362     time_latch_stamps();
1363 
1364     set_cpu_state(CPU_STATE_ONLINE);
1365     while ( !cpu_online(cpu) )
1366     {
1367         cpu_relax();
1368         process_pending_softirqs();
1369     }
1370 
1371     return 0;
1372 }
1373 
1374 
1375 void __init smp_cpus_done(void)
1376 {
1377     if ( nmi_watchdog == NMI_LOCAL_APIC )
1378     {
1379         setup_apic_nmi_watchdog();
1380         check_nmi_watchdog();
1381     }
1382 
1383     setup_ioapic_dest();
1384 
1385     mtrr_save_state();
1386     mtrr_aps_sync_end();
1387 }
1388 
1389 void __init smp_intr_init(void)
1390 {
1391     int irq, vector, seridx, cpu = smp_processor_id();
1392 
1393     /*
1394      * IRQ0 must be given a fixed assignment and initialized,
1395      * because it's used before the IO-APIC is set up.
1396      */
1397     irq_to_desc(0)->arch.vector = IRQ0_VECTOR;
1398 
1399     /*
1400      * Also ensure serial interrupts are high priority. We do not
1401      * want them to be blocked by unacknowledged guest-bound interrupts.
1402      */
1403     for ( seridx = 0; seridx <= SERHND_IDX; seridx++ )
1404     {
1405         if ( (irq = serial_irq(seridx)) < 0 )
1406             continue;
1407         vector = alloc_hipriority_vector();
1408         per_cpu(vector_irq, cpu)[vector] = irq;
1409         irq_to_desc(irq)->arch.vector = vector;
1410         cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map);
1411     }
1412 
1413     /* Direct IPI vectors. */
1414     set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
1415     set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt);
1416     set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1417     set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt);
1418 }
1419