1 /*
2 * x86 SMP booting functions
3 *
4 * This inherits a great deal from Linux's SMP boot code:
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <xen/init.h>
23 #include <xen/kernel.h>
24 #include <xen/mm.h>
25 #include <xen/domain.h>
26 #include <xen/domain_page.h>
27 #include <xen/sched.h>
28 #include <xen/irq.h>
29 #include <xen/delay.h>
30 #include <xen/softirq.h>
31 #include <xen/tasklet.h>
32 #include <xen/serial.h>
33 #include <xen/numa.h>
34 #include <xen/cpu.h>
35 #include <asm/cpuidle.h>
36 #include <asm/current.h>
37 #include <asm/mc146818rtc.h>
38 #include <asm/desc.h>
39 #include <asm/div64.h>
40 #include <asm/flushtlb.h>
41 #include <asm/guest.h>
42 #include <asm/microcode.h>
43 #include <asm/msr.h>
44 #include <asm/mtrr.h>
45 #include <asm/spec_ctrl.h>
46 #include <asm/time.h>
47 #include <asm/tboot.h>
48 #include <irq_vectors.h>
49 #include <mach_apic.h>
50
51 unsigned long __read_mostly trampoline_phys;
52
53 /* representing HT siblings of each logical CPU */
54 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
55 /* representing HT and core siblings of each logical CPU */
56 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
57
58 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
59 static cpumask_t scratch_cpu0mask;
60
61 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, send_ipi_cpumask);
62 static cpumask_t send_ipi_cpu0mask;
63
64 cpumask_t cpu_online_map __read_mostly;
65 EXPORT_SYMBOL(cpu_online_map);
66
67 bool __read_mostly park_offline_cpus;
68
69 unsigned int __read_mostly nr_sockets;
70 cpumask_t **__read_mostly socket_cpumask;
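/*
 * secondary_socket_cpumask is a pre-allocated spare mask, handed over to
 * socket_cpumask[] by smp_store_cpu_info() the first time a CPU on a
 * not-yet-seen socket comes up; cpu_smpboot_alloc() replenishes it for the
 * next new socket.
 */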
71 static cpumask_t *secondary_socket_cpumask;
72
73 struct cpuinfo_x86 cpu_data[NR_CPUS];
74
75 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
76 { [0 ... NR_CPUS-1] = BAD_APICID };
77
78 static int cpu_error;
79 static enum cpu_state {
80 CPU_STATE_DYING, /* slave -> master: I am dying */
81 CPU_STATE_DEAD, /* slave -> master: I am completely dead */
82 CPU_STATE_INIT, /* master -> slave: Early bringup phase 1 */
83 CPU_STATE_CALLOUT, /* master -> slave: Early bringup phase 2 */
84 CPU_STATE_CALLIN, /* slave -> master: Completed phase 2 */
85 CPU_STATE_ONLINE /* master -> slave: Go fully online now. */
86 } cpu_state;
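/*
 * Publish a new handshake state.  The full barrier orders all prior writes
 * ahead of the cpu_state update, so the peer spinning on cpu_state sees
 * them once it observes the new state.
 */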
87 #define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
88
89 void *stack_base[NR_CPUS];
90
void initialize_cpu_data(unsigned int cpu)
92 {
93 cpu_data[cpu] = boot_cpu_data;
94 }
95
static bool smp_store_cpu_info(unsigned int id)
97 {
98 unsigned int socket;
99
100 if ( system_state != SYS_STATE_resume )
101 identify_cpu(&cpu_data[id]);
102 else if ( !recheck_cpu_features(id) )
103 return false;
104
105 socket = cpu_to_socket(id);
106 if ( !socket_cpumask[socket] )
107 {
108 socket_cpumask[socket] = secondary_socket_cpumask;
109 secondary_socket_cpumask = NULL;
110 }
111
112 return true;
113 }
114
/*
 * The TSC's upper 32 bits can't be written on earlier CPUs (before
 * Prescott), so there is no way to resync an AP against the BP.
 */
119 bool disable_tsc_sync;
120
121 static atomic_t tsc_count;
122 static uint64_t tsc_value;
123 static cpumask_t tsc_sync_cpu_mask;
124
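/*
 * TSC synchronisation rendezvous, paced by tsc_count: in each of five rounds
 * the master samples its TSC into tsc_value and advances tsc_count to an odd
 * value; the slave waits for that, writes the sampled value into its own TSC,
 * and advances tsc_count to the matching even value.
 */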
static void synchronize_tsc_master(unsigned int slave)
126 {
127 unsigned int i;
128
129 if ( disable_tsc_sync )
130 return;
131
132 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
133 !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
134 return;
135
136 for ( i = 1; i <= 5; i++ )
137 {
138 tsc_value = rdtsc_ordered();
139 smp_wmb();
140 atomic_inc(&tsc_count);
141 while ( atomic_read(&tsc_count) != (i<<1) )
142 cpu_relax();
143 }
144
145 atomic_set(&tsc_count, 0);
146 cpumask_clear_cpu(slave, &tsc_sync_cpu_mask);
147 }
148
static void synchronize_tsc_slave(unsigned int slave)
150 {
151 unsigned int i;
152
153 if ( disable_tsc_sync )
154 return;
155
156 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
157 !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
158 return;
159
160 for ( i = 1; i <= 5; i++ )
161 {
162 while ( atomic_read(&tsc_count) != ((i<<1)-1) )
163 cpu_relax();
164 smp_rmb();
165 /*
166 * If a CPU has been physically hotplugged, we may as well write
167 * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
168 * not sync up a new CPU's TSC for us.
169 */
170 __write_tsc(tsc_value);
171 atomic_inc(&tsc_count);
172 }
173 }
174
static void smp_callin(void)
176 {
177 unsigned int cpu = smp_processor_id();
178 int i, rc;
179
180 /* Wait 2s total for startup. */
181 Dprintk("Waiting for CALLOUT.\n");
182 for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
183 {
184 BUG_ON(i >= 200);
185 cpu_relax();
186 mdelay(10);
187 }
188
189 /*
190 * The boot CPU has finished the init stage and is spinning on cpu_state
191 * update until we finish. We are free to set up this CPU: first the APIC.
192 */
193 Dprintk("CALLIN, before setup_local_APIC().\n");
194 x2apic_ap_setup();
195 setup_local_APIC(false);
196
197 /* Save our processor parameters. */
198 if ( !smp_store_cpu_info(cpu) )
199 {
200 printk("CPU%u: Failed to validate features - not coming back online\n",
201 cpu);
202 cpu_error = -ENXIO;
203 goto halt;
204 }
205
206 if ( cpu_has_hypervisor && (rc = hypervisor_ap_setup()) != 0 )
207 {
208 printk("CPU%d: Failed to initialise hypervisor functions. Not coming online.\n", cpu);
209 cpu_error = rc;
210 goto halt;
211 }
212
213 if ( (rc = hvm_cpu_up()) != 0 )
214 {
215 printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu);
216 cpu_error = rc;
217 halt:
218 clear_local_APIC();
219 spin_debug_enable();
220 play_dead();
221 }
222
223 /* Allow the master to continue. */
224 set_cpu_state(CPU_STATE_CALLIN);
225
226 synchronize_tsc_slave(cpu);
227
228 /* And wait for our final Ack. */
229 while ( cpu_state != CPU_STATE_ONLINE )
230 cpu_relax();
231 }
232
233 static int booting_cpu;
234
235 /* CPUs for which sibling maps can be computed. */
236 static cpumask_t cpu_sibling_setup_map;
237
static void link_thread_siblings(int cpu1, int cpu2)
239 {
240 cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2));
241 cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1));
242 cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2));
243 cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
244 }
245
static void set_cpu_sibling_map(unsigned int cpu)
247 {
248 unsigned int i;
249 struct cpuinfo_x86 *c = cpu_data;
250
251 cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
252
253 cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
254 cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
255 cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
256
257 if ( c[cpu].x86_num_siblings > 1 )
258 {
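        /*
         * Thread siblings share either a compute unit ID or a core ID;
         * match on whichever identifier both CPUs report as valid.
         */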
259 for_each_cpu ( i, &cpu_sibling_setup_map )
260 {
261 if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
262 continue;
263 if ( c[cpu].compute_unit_id != INVALID_CUID &&
264 c[i].compute_unit_id != INVALID_CUID )
265 {
266 if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
267 link_thread_siblings(cpu, i);
268 }
269 else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
270 c[i].cpu_core_id != XEN_INVALID_CORE_ID )
271 {
272 if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
273 link_thread_siblings(cpu, i);
274 }
275 else
276 printk(XENLOG_WARNING
277 "CPU%u: unclear relationship with CPU%u\n",
278 cpu, i);
279 }
280 }
281
282 if ( c[cpu].x86_max_cores == 1 )
283 {
284 cpumask_copy(per_cpu(cpu_core_mask, cpu),
285 per_cpu(cpu_sibling_mask, cpu));
286 c[cpu].booted_cores = 1;
287 return;
288 }
289
290 for_each_cpu ( i, &cpu_sibling_setup_map )
291 {
292 if ( c[cpu].phys_proc_id == c[i].phys_proc_id )
293 {
294 cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu));
295 cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i));
            /*
             * Does this new cpu bring up a new core?
             */
299 if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
300 {
301 /*
302 * for each core in package, increment
303 * the booted_cores for this new cpu
304 */
305 if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i )
306 c[cpu].booted_cores++;
307 /*
308 * increment the core count for all
309 * the other cpus in this package
310 */
311 if ( i != cpu )
312 c[i].booted_cores++;
313 }
314 else if ( (i != cpu) && !c[cpu].booted_cores )
315 {
316 c[cpu].booted_cores = c[i].booted_cores;
317 }
318 }
319 }
320 }
321
void start_secondary(void *unused)
323 {
    /*
     * Don't put anything before smp_callin(); SMP booting is so fragile that
     * we want to limit the work done here to the bare minimum.
     */
328 unsigned int cpu = booting_cpu;
329
330 /* Critical region without IDT or TSS. Any fault is deadly! */
331
332 set_current(idle_vcpu[cpu]);
333 this_cpu(curr_vcpu) = idle_vcpu[cpu];
334 rdmsrl(MSR_EFER, this_cpu(efer));
335 init_shadow_spec_ctrl_state();
336
337 /*
338 * Just as during early bootstrap, it is convenient here to disable
339 * spinlock checking while we have IRQs disabled. This allows us to
340 * acquire IRQ-unsafe locks when it would otherwise be disallowed.
341 *
342 * It is safe because the race we are usually trying to avoid involves
343 * a group of CPUs rendezvousing in an IPI handler, where one cannot
344 * join because it is spinning with IRQs disabled waiting to acquire a
345 * lock held by another in the rendezvous group (the lock must be an
346 * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and
347 * hence had IRQs enabled). This is a deadlock scenario.
348 *
349 * However, no CPU can be involved in rendezvous until it is online,
350 * hence no such group can be waiting for this CPU until it is
351 * visible in cpu_online_map. Hence such a deadlock is not possible.
352 */
353 spin_debug_disable();
354
355 get_cpu_info()->use_pv_cr3 = false;
356 get_cpu_info()->xen_cr3 = 0;
357 get_cpu_info()->pv_cr3 = 0;
358
359 load_system_tables();
360
361 /* Full exception support from here on in. */
362
363 /* Safe to enable feature such as CR4.MCE with the IDT set up now. */
364 write_cr4(mmu_cr4_features);
365
366 percpu_traps_init();
367
368 cpu_init();
369
370 initialize_cpu_data(cpu);
371
372 microcode_update_one();
373
374 /*
375 * If any speculative control MSRs are available, apply Xen's default
376 * settings. Note: These MSRs may only become available after loading
377 * microcode.
378 */
379 if ( boot_cpu_has(X86_FEATURE_IBRSB) )
380 wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
381 if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
382 wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
383
384 tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
385
386 smp_callin();
387
388 set_cpu_sibling_map(cpu);
389
390 init_percpu_time();
391
392 setup_secondary_APIC_clock();
393
394 /*
395 * low-memory mappings have been cleared, flush them from
396 * the local TLBs too.
397 */
398 flush_tlb_local();
399
400 /* This must be done before setting cpu_online_map */
401 spin_debug_enable();
402 notify_cpu_starting(cpu);
403
    /*
     * We need to hold vector_lock so that the set of online cpus does not
     * change while we are assigning vectors to cpus.  Holding this lock
     * ensures we don't half assign or remove an irq from a cpu.
     */
409 lock_vector_lock();
410 setup_vector_irq(cpu);
411 cpumask_set_cpu(cpu, &cpu_online_map);
412 unlock_vector_lock();
413
414 /* We can take interrupts now: we're officially "up". */
415 local_irq_enable();
416 mtrr_ap_init();
417
418 startup_cpu_idle_loop();
419 }
420
421 extern void *stack_start;
422
static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
424 {
425 unsigned long send_status = 0, accept_status = 0;
426 int maxlvt, timeout, i;
427
428 /*
429 * Be paranoid about clearing APIC errors.
430 */
431 apic_write(APIC_ESR, 0);
432 apic_read(APIC_ESR);
433
434 Dprintk("Asserting INIT.\n");
435
436 /*
437 * Turn INIT on target chip via IPI
438 */
439 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
440 phys_apicid);
441
442 if ( !x2apic_enabled )
443 {
444 Dprintk("Waiting for send to finish...\n");
445 timeout = 0;
446 do {
447 Dprintk("+");
448 udelay(100);
449 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
450 } while ( send_status && (timeout++ < 1000) );
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
457
458 Dprintk("Waiting for send to finish...\n");
459 timeout = 0;
460 do {
461 Dprintk("+");
462 udelay(100);
463 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
464 } while ( send_status && (timeout++ < 1000) );
465 }
466 else if ( tboot_in_measured_env() )
467 {
        /*
         * With tboot the AP is actually spinning in a mini-guest before
         * receiving INIT. Upon receiving the INIT IPI, the AP needs time to
         * VMExit, update the VMCS to track SIPIs, and VMResume.
         *
         * While the AP is in root mode handling the INIT, the CPU will drop
         * any SIPIs.
         */
476 udelay(10);
477 }
478
479 maxlvt = get_maxlvt();
480
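    /*
     * Send the two STARTUP (SIPI) IPIs.  The vector field carries the page
     * number of the trampoline (start_eip >> 12), at which the AP begins
     * executing in real mode.
     */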
481 for ( i = 0; i < 2; i++ )
482 {
483 Dprintk("Sending STARTUP #%d.\n", i+1);
484 apic_write(APIC_ESR, 0);
485 apic_read(APIC_ESR);
486 Dprintk("After apic_write.\n");
487
488 /*
489 * STARTUP IPI
490 * Boot on the stack
491 */
492 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
493
494 if ( !x2apic_enabled )
495 {
496 /* Give the other CPU some time to accept the IPI. */
497 udelay(300);
498
499 Dprintk("Startup point 1.\n");
500
501 Dprintk("Waiting for send to finish...\n");
502 timeout = 0;
503 do {
504 Dprintk("+");
505 udelay(100);
506 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
507 } while ( send_status && (timeout++ < 1000) );
508
509 /* Give the other CPU some time to accept the IPI. */
510 udelay(200);
511 }
512
513 /* Due to the Pentium erratum 3AP. */
514 if ( maxlvt > 3 )
515 {
516 apic_write(APIC_ESR, 0);
517 }
518 accept_status = (apic_read(APIC_ESR) & 0xEF);
519 if ( send_status || accept_status )
520 break;
521 }
522 Dprintk("After Startup.\n");
523
524 if ( send_status )
525 printk("APIC never delivered???\n");
526 if ( accept_status )
527 printk("APIC delivery error (%lx).\n", accept_status);
528
529 return (send_status | accept_status);
530 }
531
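/* Pick the lowest CPU number which is not yet marked present. */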
int alloc_cpu_id(void)
533 {
534 cpumask_t tmp_map;
535 int cpu;
536
537 cpumask_complement(&tmp_map, &cpu_present_map);
538 cpu = cpumask_first(&tmp_map);
539 return (cpu < nr_cpu_ids) ? cpu : -ENODEV;
540 }
541
static int do_boot_cpu(int apicid, int cpu)
543 {
544 int timeout, boot_error = 0, rc = 0;
545 unsigned long start_eip;
546
547 /*
548 * Save current MTRR state in case it was changed since early boot
549 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
550 */
551 mtrr_save_state();
552
553 booting_cpu = cpu;
554
555 start_eip = bootsym_phys(trampoline_realmode_entry);
556
    /*
     * start_eip needs to be page aligned, and below the 1M boundary, as the
     * SIPI vector encodes only bits 12-19 of the trampoline address.
     */
558 if ( start_eip & ~0xff000 )
559 panic("AP trampoline %#lx not suitably positioned\n", start_eip);
560
561 /* So we see what's up */
562 if ( opt_cpu_info )
563 printk("Booting processor %d/%d eip %lx\n",
564 cpu, apicid, start_eip);
565
566 stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
567
568 /* This grunge runs the startup process for the targeted processor. */
569
570 set_cpu_state(CPU_STATE_INIT);
571
572 /* Starting actual IPI sequence... */
573 if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) )
574 boot_error = wakeup_secondary_cpu(apicid, start_eip);
575
576 if ( !boot_error )
577 {
578 /* Allow AP to start initializing. */
579 set_cpu_state(CPU_STATE_CALLOUT);
580 Dprintk("After Callout %d.\n", cpu);
581
582 /* Wait 5s total for a response. */
583 for ( timeout = 0; timeout < 50000; timeout++ )
584 {
585 if ( cpu_state != CPU_STATE_CALLOUT )
586 break;
587 udelay(100);
588 }
589
590 if ( cpu_state == CPU_STATE_CALLIN )
591 {
592 /* number CPUs logically, starting from 1 (BSP is 0) */
593 Dprintk("OK.\n");
594 print_cpu_info(cpu);
595 synchronize_tsc_master(cpu);
596 Dprintk("CPU has booted.\n");
597 }
598 else if ( cpu_state == CPU_STATE_DEAD )
599 {
600 smp_rmb();
601 rc = cpu_error;
602 }
603 else
604 {
605 boot_error = 1;
606 smp_mb();
607 if ( bootsym(trampoline_cpu_started) == 0xA5 )
608 /* trampoline started but...? */
609 printk("Stuck ??\n");
610 else
611 /* trampoline code not run */
612 printk("Not responding.\n");
613 }
614 }
615
616 if ( boot_error )
617 {
618 cpu_exit_clear(cpu);
619 rc = -EIO;
620 }
621
622 /* mark "stuck" area as not stuck */
623 bootsym(trampoline_cpu_started) = 0;
624 smp_mb();
625
626 return rc;
627 }
628
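/* Byte offset of a CPU's stub buffer within the stub page it shares. */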
629 #define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
630
unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
632 {
633 unsigned long stub_va;
634 struct page_info *pg;
635
636 BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
637
638 if ( *mfn )
639 pg = mfn_to_page(_mfn(*mfn));
640 else
641 {
642 nodeid_t node = cpu_to_node(cpu);
643 unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
644
645 pg = alloc_domheap_page(NULL, memflags);
646 if ( !pg )
647 return 0;
648
649 unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
650 }
651
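    /*
     * Map the stub page executable (RX) into a per-CPU virtual slot just
     * below the FIXADDR_X range near the top of Xen's virtual address space.
     */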
652 stub_va = XEN_VIRT_END - FIXADDR_X_SIZE - (cpu + 1) * PAGE_SIZE;
653 if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
654 PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
655 {
656 if ( !*mfn )
657 free_domheap_page(pg);
658 stub_va = 0;
659 }
660 else if ( !*mfn )
661 *mfn = mfn_x(page_to_mfn(pg));
662
663 return stub_va;
664 }
665
void cpu_exit_clear(unsigned int cpu)
667 {
668 cpu_uninit(cpu);
669 set_cpu_state(CPU_STATE_DEAD);
670 }
671
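/*
 * Clone into rpt the mapping, as found in idle_pg_table, of the page
 * containing ptr.  Superpages are broken down to a single L1 entry and the
 * _PAGE_GLOBAL flag is stripped, allocating intermediate page tables in rpt
 * as needed.
 */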
static int clone_mapping(const void *ptr, root_pgentry_t *rpt)
673 {
674 unsigned long linear = (unsigned long)ptr, pfn;
675 unsigned int flags;
676 l3_pgentry_t *pl3e;
677 l2_pgentry_t *pl2e;
678 l1_pgentry_t *pl1e;
679
680 /*
681 * Sanity check 'linear'. We only allow cloning from the Xen virtual
682 * range, and in particular, only from the directmap and .text ranges.
683 */
684 if ( root_table_offset(linear) > ROOT_PAGETABLE_LAST_XEN_SLOT ||
685 root_table_offset(linear) < ROOT_PAGETABLE_FIRST_XEN_SLOT )
686 return -EINVAL;
687
688 if ( linear < XEN_VIRT_START ||
689 (linear >= XEN_VIRT_END && linear < DIRECTMAP_VIRT_START) )
690 return -EINVAL;
691
692 pl3e = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]) +
693 l3_table_offset(linear);
694
695 flags = l3e_get_flags(*pl3e);
696 ASSERT(flags & _PAGE_PRESENT);
697 if ( flags & _PAGE_PSE )
698 {
699 pfn = (l3e_get_pfn(*pl3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
700 (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
701 flags &= ~_PAGE_PSE;
702 }
703 else
704 {
705 pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(linear);
706 flags = l2e_get_flags(*pl2e);
707 ASSERT(flags & _PAGE_PRESENT);
708 if ( flags & _PAGE_PSE )
709 {
710 pfn = (l2e_get_pfn(*pl2e) & ~((1UL << PAGETABLE_ORDER) - 1)) |
711 (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
712 flags &= ~_PAGE_PSE;
713 }
714 else
715 {
716 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(linear);
717 flags = l1e_get_flags(*pl1e);
718 if ( !(flags & _PAGE_PRESENT) )
719 return 0;
720 pfn = l1e_get_pfn(*pl1e);
721 }
722 }
723
724 if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
725 {
726 pl3e = alloc_xen_pagetable();
727 if ( !pl3e )
728 return -ENOMEM;
729 clear_page(pl3e);
730 l4e_write(&rpt[root_table_offset(linear)],
731 l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
732 }
733 else
734 pl3e = l4e_to_l3e(rpt[root_table_offset(linear)]);
735
736 pl3e += l3_table_offset(linear);
737
738 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
739 {
740 pl2e = alloc_xen_pagetable();
741 if ( !pl2e )
742 return -ENOMEM;
743 clear_page(pl2e);
744 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
745 }
746 else
747 {
748 ASSERT(!(l3e_get_flags(*pl3e) & _PAGE_PSE));
749 pl2e = l3e_to_l2e(*pl3e);
750 }
751
752 pl2e += l2_table_offset(linear);
753
754 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
755 {
756 pl1e = alloc_xen_pagetable();
757 if ( !pl1e )
758 return -ENOMEM;
759 clear_page(pl1e);
760 l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
761 }
762 else
763 {
764 ASSERT(!(l2e_get_flags(*pl2e) & _PAGE_PSE));
765 pl1e = l2e_to_l1e(*pl2e);
766 }
767
768 pl1e += l1_table_offset(linear);
769 flags &= ~_PAGE_GLOBAL;
770
771 if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT )
772 {
773 ASSERT(l1e_get_pfn(*pl1e) == pfn);
774 ASSERT(l1e_get_flags(*pl1e) == flags);
775 }
776 else
777 l1e_write(pl1e, l1e_from_pfn(pfn, flags));
778
779 return 0;
780 }
781
782 DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
783
784 static root_pgentry_t common_pgt;
785
786 extern const char _stextentry[], _etextentry[];
787
static int setup_cpu_root_pgt(unsigned int cpu)
789 {
790 root_pgentry_t *rpt;
791 unsigned int off;
792 int rc;
793
794 if ( !opt_xpti_hwdom && !opt_xpti_domu )
795 return 0;
796
797 rpt = alloc_xen_pagetable();
798 if ( !rpt )
799 return -ENOMEM;
800
801 clear_page(rpt);
802 per_cpu(root_pgt, cpu) = rpt;
803
804 rpt[root_table_offset(RO_MPT_VIRT_START)] =
805 idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
806 /* SH_LINEAR_PT inserted together with guest mappings. */
807 /* PERDOMAIN inserted during context switch. */
808
809 /* One-time setup of common_pgt, which maps .text.entry and the stubs. */
810 if ( unlikely(!root_get_intpte(common_pgt)) )
811 {
812 const char *ptr;
813
814 for ( rc = 0, ptr = _stextentry;
815 !rc && ptr < _etextentry; ptr += PAGE_SIZE )
816 rc = clone_mapping(ptr, rpt);
817
818 if ( rc )
819 return rc;
820
821 common_pgt = rpt[root_table_offset(XEN_VIRT_START)];
822 }
823
824 rpt[root_table_offset(XEN_VIRT_START)] = common_pgt;
825
826 /* Install direct map page table entries for stack, IDT, and TSS. */
827 for ( off = rc = 0; !rc && off < STACK_SIZE; off += PAGE_SIZE )
828 rc = clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt);
829
830 if ( !rc )
831 rc = clone_mapping(idt_tables[cpu], rpt);
832 if ( !rc )
833 {
834 struct tss_page *ptr = &per_cpu(tss_page, cpu);
835
836 BUILD_BUG_ON(sizeof(*ptr) != PAGE_SIZE);
837
838 rc = clone_mapping(&ptr->tss, rpt);
839 }
840 if ( !rc )
841 rc = clone_mapping((void *)per_cpu(stubs.addr, cpu), rpt);
842
843 return rc;
844 }
845
static void cleanup_cpu_root_pgt(unsigned int cpu)
847 {
848 root_pgentry_t *rpt = per_cpu(root_pgt, cpu);
849 unsigned int r;
850 unsigned long stub_linear = per_cpu(stubs.addr, cpu);
851
852 if ( !rpt )
853 return;
854
855 per_cpu(root_pgt, cpu) = NULL;
856
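    /*
     * Walk the Xen slots of the per-CPU root page table and free every
     * intermediate page table that clone_mapping() allocated.
     */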
857 for ( r = root_table_offset(DIRECTMAP_VIRT_START);
858 r < root_table_offset(HYPERVISOR_VIRT_END); ++r )
859 {
860 l3_pgentry_t *l3t;
861 mfn_t l3mfn;
862 unsigned int i3;
863
864 if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) )
865 continue;
866
867 l3mfn = l4e_get_mfn(rpt[r]);
868 l3t = map_domain_page(l3mfn);
869
870 for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 )
871 {
872 l2_pgentry_t *l2t;
873 mfn_t l2mfn;
874 unsigned int i2;
875
876 if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) )
877 continue;
878
879 ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE));
880 l2mfn = l3e_get_mfn(l3t[i3]);
881 l2t = map_domain_page(l2mfn);
882
883 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 )
884 {
885 if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) )
886 continue;
887
888 ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE));
889 free_xen_pagetable_new(l2e_get_mfn(l2t[i2]));
890 }
891
892 unmap_domain_page(l2t);
893 free_xen_pagetable_new(l2mfn);
894 }
895
896 unmap_domain_page(l3t);
897 free_xen_pagetable_new(l3mfn);
898 }
899
900 free_xen_pagetable(rpt);
901
902 /* Also zap the stub mapping for this CPU. */
903 if ( stub_linear )
904 {
905 l3_pgentry_t l3e = l3e_from_l4e(common_pgt,
906 l3_table_offset(stub_linear));
907 l2_pgentry_t l2e = l2e_from_l3e(l3e, l2_table_offset(stub_linear));
908 l1_pgentry_t *l1t = map_l1t_from_l2e(l2e);
909
910 l1t[l1_table_offset(stub_linear)] = l1e_empty();
911
912 unmap_domain_page(l1t);
913 }
914 }
915
916 /*
917 * The 'remove' boolean controls whether a CPU is just getting offlined (and
918 * parked), or outright removed / offlined without parking. Parked CPUs need
919 * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
920 * A few other items, in particular CPU masks, are also retained, as it's
921 * difficult to prove that they're entirely unreferenced from parked CPUs.
922 */
static void cpu_smpboot_free(unsigned int cpu, bool remove)
924 {
925 unsigned int socket = cpu_to_socket(cpu);
926 struct cpuinfo_x86 *c = cpu_data;
927
928 if ( cpumask_empty(socket_cpumask[socket]) )
929 {
930 xfree(socket_cpumask[socket]);
931 socket_cpumask[socket] = NULL;
932 }
933
934 cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
935
936 if ( remove )
937 {
938 c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
939 c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
940 c[cpu].compute_unit_id = INVALID_CUID;
941
942 FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
943 FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
944 if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
945 FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
946 if ( per_cpu(send_ipi_cpumask, cpu) != &send_ipi_cpu0mask )
947 FREE_CPUMASK_VAR(per_cpu(send_ipi_cpumask, cpu));
948 }
949
950 cleanup_cpu_root_pgt(cpu);
951
952 if ( per_cpu(stubs.addr, cpu) )
953 {
954 mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
955 unsigned char *stub_page = map_domain_page(mfn);
956 unsigned int i;
957
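        /*
         * Poison this CPU's stub buffer, then free the whole page only if
         * every buffer on it is now unused (i.e. still starts with 0xcc).
         */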
958 memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
959 for ( i = 0; i < STUBS_PER_PAGE; ++i )
960 if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
961 break;
962 unmap_domain_page(stub_page);
963 destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
964 (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
965 per_cpu(stubs.addr, cpu) = 0;
966 per_cpu(stubs.mfn, cpu) = 0;
967 if ( i == STUBS_PER_PAGE )
968 free_domheap_page(mfn_to_page(mfn));
969 }
970
971 if ( IS_ENABLED(CONFIG_PV32) )
972 FREE_XENHEAP_PAGE(per_cpu(compat_gdt, cpu));
973
974 if ( remove )
975 {
976 FREE_XENHEAP_PAGE(per_cpu(gdt, cpu));
977 FREE_XENHEAP_PAGE(idt_tables[cpu]);
978
979 if ( stack_base[cpu] )
980 {
981 memguard_unguard_stack(stack_base[cpu]);
982 FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
983 }
984 }
985 }
986
static int cpu_smpboot_alloc(unsigned int cpu)
988 {
989 struct cpu_info *info;
990 unsigned int i, memflags = 0;
991 nodeid_t node = cpu_to_node(cpu);
992 seg_desc_t *gdt;
993 unsigned long stub_page;
994 int rc = -ENOMEM;
995
996 if ( node != NUMA_NO_NODE )
997 memflags = MEMF_node(node);
998
999 if ( stack_base[cpu] == NULL )
1000 {
1001 stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
1002 if ( !stack_base[cpu] )
1003 goto out;
1004
1005 memguard_guard_stack(stack_base[cpu]);
1006 }
1007
1008 info = get_cpu_info_from_stack((unsigned long)stack_base[cpu]);
1009 info->processor_id = cpu;
1010 info->per_cpu_offset = __per_cpu_offset[cpu];
1011
1012 gdt = per_cpu(gdt, cpu) ?: alloc_xenheap_pages(0, memflags);
1013 if ( gdt == NULL )
1014 goto out;
1015 per_cpu(gdt, cpu) = gdt;
1016 per_cpu(gdt_l1e, cpu) =
1017 l1e_from_pfn(virt_to_mfn(gdt), __PAGE_HYPERVISOR_RW);
1018 memcpy(gdt, boot_gdt, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
1019 BUILD_BUG_ON(NR_CPUS > 0x10000);
1020 gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
1021
1022 #ifdef CONFIG_PV32
1023 per_cpu(compat_gdt, cpu) = gdt = alloc_xenheap_pages(0, memflags);
1024 if ( gdt == NULL )
1025 goto out;
1026 per_cpu(compat_gdt_l1e, cpu) =
1027 l1e_from_pfn(virt_to_mfn(gdt), __PAGE_HYPERVISOR_RW);
1028 memcpy(gdt, boot_compat_gdt, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
1029 gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
1030 #endif
1031
1032 if ( idt_tables[cpu] == NULL )
1033 idt_tables[cpu] = alloc_xenheap_pages(0, memflags);
1034 if ( idt_tables[cpu] == NULL )
1035 goto out;
1036 memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
1037 disable_each_ist(idt_tables[cpu]);
1038
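    /*
     * Reuse a stub page already allocated for another online CPU in the same
     * STUBS_PER_PAGE-aligned group and on the same node, if there is one;
     * otherwise alloc_stub_page() below allocates a fresh page.
     */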
1039 for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
1040 i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
1041 if ( cpu_online(i) && cpu_to_node(i) == node )
1042 {
1043 per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
1044 break;
1045 }
1046 BUG_ON(i == cpu);
1047 stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
1048 if ( !stub_page )
1049 goto out;
1050 per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
1051
1052 rc = setup_cpu_root_pgt(cpu);
1053 if ( rc )
1054 goto out;
1055 rc = -ENOMEM;
1056
1057 if ( secondary_socket_cpumask == NULL &&
1058 (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
1059 goto out;
1060
1061 if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
1062 cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
1063 cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) &&
1064 cond_alloc_cpumask_var(&per_cpu(send_ipi_cpumask, cpu))) )
1065 goto out;
1066
1067 rc = 0;
1068
1069 out:
1070 if ( rc )
1071 cpu_smpboot_free(cpu, true);
1072
1073 return rc;
1074 }
1075
static int cpu_smpboot_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
1078 {
1079 unsigned int cpu = (unsigned long)hcpu;
1080 int rc = 0;
1081
1082 switch ( action )
1083 {
1084 case CPU_UP_PREPARE:
1085 rc = cpu_smpboot_alloc(cpu);
1086 break;
1087 case CPU_UP_CANCELED:
1088 case CPU_DEAD:
1089 cpu_smpboot_free(cpu, !park_offline_cpus);
1090 break;
1091 case CPU_REMOVE:
1092 cpu_smpboot_free(cpu, true);
1093 break;
1094 }
1095
1096 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1097 }
1098
1099 static struct notifier_block cpu_smpboot_nfb = {
1100 .notifier_call = cpu_smpboot_callback
1101 };
1102
void __init smp_prepare_cpus(void)
1104 {
1105 int rc;
1106
1107 register_cpu_notifier(&cpu_smpboot_nfb);
1108
1109 mtrr_aps_sync_begin();
1110
1111 /* Setup boot CPU information */
1112 initialize_cpu_data(0); /* Final full version of the data */
1113 print_cpu_info(0);
1114
1115 boot_cpu_physical_apicid = get_apic_id();
1116 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
1117
1118 stack_base[0] = (void *)((unsigned long)stack_start & ~(STACK_SIZE - 1));
1119
1120 rc = setup_cpu_root_pgt(0);
1121 if ( rc )
1122 panic("Error %d setting up PV root page table\n", rc);
1123 if ( per_cpu(root_pgt, 0) )
1124 {
1125 get_cpu_info()->pv_cr3 = 0;
1126
1127 #ifdef CONFIG_PV
1128 /*
1129 * All entry points which may need to switch page tables have to start
1130 * with interrupts off. Re-write what pv_trap_init() has put there.
1131 */
1132 _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
1133 &int80_direct_trap);
1134 #endif
1135 }
1136
1137 set_nr_sockets();
1138
1139 socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
1140 if ( socket_cpumask == NULL ||
1141 (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
1142 panic("No memory for socket CPU siblings map\n");
1143
1144 if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
1145 !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
1146 panic("No memory for boot CPU sibling/core maps\n");
1147
1148 set_cpu_sibling_map(0);
1149
1150 /*
1151 * If we couldn't find an SMP configuration at boot time,
1152 * get out of here now!
1153 */
1154 if ( !smp_found_config && !acpi_lapic )
1155 {
1156 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1157 init_uniprocessor:
1158 physids_clear(phys_cpu_present_map);
1159 physid_set(0, phys_cpu_present_map);
1160 if (APIC_init_uniprocessor())
1161 printk(KERN_NOTICE "Local APIC not detected."
1162 " Using dummy APIC emulation.\n");
1163 return;
1164 }
1165
    /*
     * This should not be necessary because the MP table should list the boot
     * CPU too, but we do it for the sake of robustness anyway.
     * It makes no sense to do this check in clustered APIC mode, so skip it.
     */
1171 if ( !check_apicid_present(boot_cpu_physical_apicid) )
1172 {
1173 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
1174 boot_cpu_physical_apicid);
1175 physid_set(get_apic_id(), phys_cpu_present_map);
1176 }
1177
1178 /* If we couldn't find a local APIC, then get out of here now! */
1179 if ( !cpu_has_apic )
1180 {
1181 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1182 boot_cpu_physical_apicid);
1183 goto init_uniprocessor;
1184 }
1185
1186 verify_local_APIC();
1187
1188 connect_bsp_APIC();
1189 setup_local_APIC(true);
1190
1191 if ( !skip_ioapic_setup && nr_ioapics )
1192 setup_IO_APIC();
1193
1194 setup_boot_APIC_clock();
1195 }
1196
void __init smp_prepare_boot_cpu(void)
1198 {
1199 unsigned int cpu = smp_processor_id();
1200
1201 cpumask_set_cpu(cpu, &cpu_online_map);
1202 cpumask_set_cpu(cpu, &cpu_present_map);
1203 #if NR_CPUS > 2 * BITS_PER_LONG
1204 per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
1205 per_cpu(send_ipi_cpumask, cpu) = &send_ipi_cpu0mask;
1206 #endif
1207
1208 get_cpu_info()->use_pv_cr3 = false;
1209 get_cpu_info()->xen_cr3 = 0;
1210 get_cpu_info()->pv_cr3 = 0;
1211 }
1212
1213 static void
remove_siblinginfo(int cpu)
1215 {
1216 int sibling;
1217
1218 cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
1219
1220 for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
1221 {
1222 cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
1223 /* Last thread sibling in this cpu core going down. */
1224 if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
1225 cpu_data[sibling].booted_cores--;
1226 }
1227
1228 for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu))
1229 cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
1230 cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
1231 cpumask_clear(per_cpu(cpu_core_mask, cpu));
1232 }
1233
void __cpu_disable(void)
1235 {
1236 int cpu = smp_processor_id();
1237
1238 set_cpu_state(CPU_STATE_DYING);
1239
1240 local_irq_disable();
1241 clear_local_APIC();
1242 /* Allow any queued timer interrupts to get serviced */
1243 local_irq_enable();
1244 mdelay(1);
1245 local_irq_disable();
1246
1247 time_suspend();
1248
1249 remove_siblinginfo(cpu);
1250
1251 /* It's now safe to remove this processor from the online map */
1252 cpumask_clear_cpu(cpu, &cpu_online_map);
1253 fixup_irqs(&cpu_online_map, 1);
1254 fixup_eoi();
1255 }
1256
void __cpu_die(unsigned int cpu)
1258 {
1259 /* We don't do anything here: idle task is faking death itself. */
1260 unsigned int i = 0;
1261 enum cpu_state seen_state;
1262
1263 while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
1264 {
1265 BUG_ON(seen_state != CPU_STATE_DYING);
1266 mdelay(100);
1267 cpu_relax();
1268 process_pending_softirqs();
1269 if ( (++i % 10) == 0 )
1270 printk(KERN_ERR "CPU %u still not dead...\n", cpu);
1271 }
1272 }
1273
int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
1275 {
1276 int cpu = -1;
1277
1278 dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
1279 apic_id, acpi_id, pxm);
1280
1281 if ( (acpi_id >= MAX_MADT_ENTRIES) ||
1282 (apic_id >= MAX_APICS) ||
1283 (pxm >= 256) )
1284 return -EINVAL;
1285
1286 cpu_hotplug_begin();
1287
1288 /* Detect if the cpu has been added before */
1289 if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID )
1290 {
1291 cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id)
1292 ? -EINVAL : -EEXIST;
1293 goto out;
1294 }
1295
1296 if ( physid_isset(apic_id, phys_cpu_present_map) )
1297 {
1298 cpu = -EEXIST;
1299 goto out;
1300 }
1301
1302 if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 )
1303 goto out;
1304
1305 x86_acpiid_to_apicid[acpi_id] = apic_id;
1306
1307 if ( !srat_disabled() )
1308 {
1309 nodeid_t node = setup_node(pxm);
1310
1311 if ( node == NUMA_NO_NODE )
1312 {
1313 dprintk(XENLOG_WARNING,
1314 "Setup node failed for pxm %x\n", pxm);
1315 x86_acpiid_to_apicid[acpi_id] = BAD_APICID;
1316 mp_unregister_lapic(apic_id, cpu);
1317 cpu = -ENOSPC;
1318 goto out;
1319 }
1320 if ( apic_id < MAX_LOCAL_APIC )
1321 apicid_to_node[apic_id] = node;
1322 }
1323
1324 /* Physically added CPUs do not have synchronised TSC. */
1325 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1326 {
1327 printk_once(
1328 XENLOG_WARNING
1329 "New CPU %u may have skewed TSC and break cross-CPU TSC coherency\n"
1330 "Consider using \"tsc=skewed\" to force emulation where appropriate\n",
1331 cpu);
1332 cpumask_set_cpu(cpu, &tsc_sync_cpu_mask);
1333 }
1334
1335 srat_detect_node(cpu);
1336 numa_add_cpu(cpu);
1337 dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
1338 out:
1339 cpu_hotplug_done();
1340 return cpu;
1341 }
1342
1343
int __cpu_up(unsigned int cpu)
1345 {
1346 int apicid, ret;
1347
1348 if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
1349 return -ENODEV;
1350
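    /*
     * Reject APIC IDs this system cannot address: in xAPIC mode only IDs
     * below the broadcast ID are usable, and IDs wider than 8 bits further
     * require full interrupt remapping.
     */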
1351 if ( (!x2apic_enabled && apicid >= APIC_ALL_CPUS) ||
1352 (iommu_intremap != iommu_intremap_full && (apicid >> 8)) )
1353 {
1354 printk("Unsupported: APIC ID %#x in xAPIC mode w/o interrupt remapping\n",
1355 apicid);
1356 return -EINVAL;
1357 }
1358
1359 if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
1360 return ret;
1361
1362 time_latch_stamps();
1363
1364 set_cpu_state(CPU_STATE_ONLINE);
1365 while ( !cpu_online(cpu) )
1366 {
1367 cpu_relax();
1368 process_pending_softirqs();
1369 }
1370
1371 return 0;
1372 }
1373
1374
void __init smp_cpus_done(void)
1376 {
1377 if ( nmi_watchdog == NMI_LOCAL_APIC )
1378 {
1379 setup_apic_nmi_watchdog();
1380 check_nmi_watchdog();
1381 }
1382
1383 setup_ioapic_dest();
1384
1385 mtrr_save_state();
1386 mtrr_aps_sync_end();
1387 }
1388
void __init smp_intr_init(void)
1390 {
1391 int irq, vector, seridx, cpu = smp_processor_id();
1392
1393 /*
1394 * IRQ0 must be given a fixed assignment and initialized,
1395 * because it's used before the IO-APIC is set up.
1396 */
1397 irq_to_desc(0)->arch.vector = IRQ0_VECTOR;
1398
1399 /*
1400 * Also ensure serial interrupts are high priority. We do not
1401 * want them to be blocked by unacknowledged guest-bound interrupts.
1402 */
1403 for ( seridx = 0; seridx <= SERHND_IDX; seridx++ )
1404 {
1405 if ( (irq = serial_irq(seridx)) < 0 )
1406 continue;
1407 vector = alloc_hipriority_vector();
1408 per_cpu(vector_irq, cpu)[vector] = irq;
1409 irq_to_desc(irq)->arch.vector = vector;
1410 cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map);
1411 }
1412
1413 /* Direct IPI vectors. */
1414 set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
1415 set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt);
1416 set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1417 set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt);
1418 }
1419