/******************************************************************************
 * alternative.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/delay.h>
#include <xen/types.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/alternative.h>
#include <xen/init.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <xen/livepatch.h>

#define MAX_PATCH_LEN (255-1)

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];

#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

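/*
 * ideal_nops[len] points at a len-byte NOP sequence (hence the cumulative
 * offsets in the tables above).  It defaults to the P6 NOPs and may be
 * switched to the K8 NOPs by arch_init_ideal_nops() below.
 */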
static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

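/*
 * Pick the NOP table for this CPU: Family 6 Intel CPUs in the listed model
 * ranges and pre-Fam10h AMD CPUs get the K8 NOPs, everything else keeps the
 * P6 NOPs.  When the assembler's .nops directive is available, also record
 * whether the toolchain-emitted NOPs already match the chosen ideal NOPs, so
 * such padding can be left untouched later.
 */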
static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
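/*
 * For illustration only (a hypothetical 7-byte patch area, not a call site
 * in this file):
 *     uint8_t buf[7];
 *     add_nops(buf, sizeof(buf));
 *     text_poke(area, buf, sizeof(buf));
 * which is the pattern _apply_alternatives() follows below.
 */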
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;
        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;
        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}

/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions.  And on the local CPU you need to be protected against NMI
 * or MCE handlers seeing an inconsistent instruction while you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * "noinline" to cause a control flow change and thus invalidate the I$ and
 * cause a refetch after modification.
 */
static void *init_or_livepatch noinline
text_poke(void *addr, const void *opcode, size_t len)
{
    return memcpy(addr, opcode, len);
}

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self-modifying code.  This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough.  Make sure you disable such features by hand.
 *
 * The caller will set the "force" argument to true for the final
 * invocation, such that no CALLs/JMPs to NULL pointers will be left
 * around.  See also the further comment below.
 */
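
/*
 * Each struct alt_instr describes one origin site: ALT_ORIG_PTR() and
 * ALT_REPL_PTR() recover the addresses of the original and replacement code,
 * ->cpuid names the feature bit which decides whether to patch, and
 * ->orig_len, ->repl_len and ->pad_len give the respective sizes.  ->priv is
 * scratch space used here to remember which sites have already been handled.
 */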
static void init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end,
                                                  bool force)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end.  An alternative scanned
     * later may overwrite one scanned earlier.  Some kernel functions
     * (e.g. memcpy, memset) rely on this order to patch code.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;

        BUG_ON(a->repl_len > total_len);
        BUG_ON(total_len > sizeof(buf));
        BUG_ON(a->cpuid >= NCAPINTS * 32);

        /*
         * Detect sequences of alt_instr's patching the same origin site, and
         * keep base pointing at the first alt_instr entry.  This is so we can
         * refer to a single ->priv field for some of our patching decisions,
         * in particular the NOP optimisation.  We deliberately use the alt_instr
         * itself rather than a local variable in case we end up making multiple
         * passes.
         *
         * ->priv being nonzero means that the origin site has already been
         * modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
        {
            ASSERT(force);
            continue;
        }

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !boot_cpu_has(a->cpuid) )
        {
            /* Origin site already touched?  Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

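            /* Rewrite the origin site's padding area with the ideal NOPs. */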
            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);

        /* 0xe8/0xe9 are relative branches; fix the offset. */
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
        {
            /*
             * Detect the special case of indirect-to-direct branch patching:
             * - replacement is a direct CALL/JMP (opcodes 0xE8/0xE9; already
             *   checked above),
             * - replacement's displacement is -5 (pointing back at the very
             *   insn, which makes no sense in a real replacement insn),
             * - original is an indirect CALL/JMP (opcodes 0xFF/2 or 0xFF/4)
             *   using RIP-relative addressing.
             * Some branch destinations may still be NULL when we come here
             * the first time. Defer patching of those until the post-presmp-
             * initcalls re-invocation (with force set to true). If at that
             * point the branch destination is still NULL, insert "UD2; UD0"
             * (for ease of recognition) instead of CALL/JMP.
             */
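            /*
             * Illustrative bytes for the CALL flavour (xx = displacement
             * bytes; the JMP flavour is the same with 0xe9 / 0xff 0x25):
             *   original:     ff 15 xx xx xx xx   call *disp32(%rip)
             *   replacement:  e8 fb ff ff ff      call .   (disp32 == -5)
             *   patched to:   e8 xx xx xx xx      call dest
             * with the spare byte(s) of the origin site NOP-padded by the
             * add_nops() at the bottom of the loop.
             */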
            if ( a->cpuid == X86_FEATURE_ALWAYS &&
                 *(int32_t *)(buf + 1) == -5 &&
                 a->orig_len >= 6 &&
                 orig[0] == 0xff &&
                 orig[1] == (*buf & 1 ? 0x25 : 0x15) )
            {
                long disp = *(int32_t *)(orig + 2);
                const uint8_t *dest = *(void **)(orig + 6 + disp);

                if ( dest )
                {
                    disp = dest - (orig + 5);
                    ASSERT(disp == (int32_t)disp);
                    *(int32_t *)(buf + 1) = disp;
                }
                else if ( force )
                {
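                    /*
                     * Branch target still NULL on the final pass: plant
                     * UD2 (0f 0b) followed by UD0 (0f ff + ModRM), which is
                     * easy to recognise when it faults.
                     */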
                    buf[0] = 0x0f;
                    buf[1] = 0x0b;
                    buf[2] = 0x0f;
                    buf[3] = 0xff;
                    buf[4] = 0xff;
                }
                else
                    continue;
            }
            else if ( force && system_state < SYS_STATE_active )
                ASSERT_UNREACHABLE();
            else
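                /* Re-encode the rel32, which was relative to repl, for orig. */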
                *(int32_t *)(buf + 1) += repl - orig;
        }
        else if ( force && system_state < SYS_STATE_active )
            ASSERT_UNREACHABLE();

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }
}

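/*
 * External entry point (used e.g. for livepatch payloads): a single, forced
 * pass over the given table, so no NULL branch targets are left behind.
 */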
void init_or_livepatch apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
{
    _apply_alternatives(start, end, true);
}

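/*
 * Bitmasks coordinating boot-time patching with the NMI callback below:
 * bit 0 covers the normal (force == false) pass and bit 1 the forced pass,
 * matching the "1u << force" in _alternative_instructions().
 */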
static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

/*
 * At boot time, we patch alternatives in NMI context.  This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init nmi_apply_alternatives(const struct cpu_user_regs *regs,
                                         int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() calls
     * below.  We only need to apply alternatives once.
     */
    if ( !(alt_done & alt_todo) )
    {
        unsigned long cr0;

        cr0 = read_cr0();

        /* Disable WP to allow patching read-only pages. */
        write_cr0(cr0 & ~X86_CR0_WP);

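        /*
         * alt_done is still zero on the first pass and non-zero on the
         * final one, so it doubles as the "force" argument.
         */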
        _apply_alternatives(__alt_instructions, __alt_instructions_end,
                            alt_done);

        write_cr0(cr0);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(bool force)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.
     * MCEs only happen when something got corrupted and in this
     * case we must do something about the corruption.
     * Ignoring it is worse than an unlikely patching race.
     * Also machine checks tend to be broadcast and if one CPU
     * goes into machine check the others follow quickly, so we don't
     * expect a machine check to cause undue problems during code
     * patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
    alt_todo = 1u << force;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even an NMI ahead of our explicit self-NMI.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed.  To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);

    /*
     * When Xen is using shadow stacks, the alternatives clearing CR0.WP and
     * writing into the mappings set dirty bits, turning the mappings into
     * shadow stack mappings.
     *
     * While we can execute from them, this would also permit them to be the
     * target of WRSS instructions, so reset the dirty bits after patching.
     */
    if ( cpu_has_xen_shstk )
        modify_xen_mappings(XEN_VIRT_START + MB(2),
                            (unsigned long)&__2M_text_end,
                            PAGE_HYPERVISOR_RX);
}

void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(false);
}

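/*
 * The final, forced pass, re-invoked after the presmp initcalls have run (see
 * the branch patching comment in _apply_alternatives()).
 */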
void __init alternative_branches(void)
{
    local_irq_disable();
    _alternative_instructions(true);
    local_irq_enable();
}