/******************************************************************************
 * alternative.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/delay.h>
#include <xen/types.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/alternative.h>
#include <xen/init.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <xen/livepatch.h>

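/* Upper bound on the number of bytes patched at a single origin site. */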
#define MAX_PATCH_LEN (255-1)

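/* Bounds of the build-time alternatives table (linker-provided symbols). */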
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];

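/*
 * Tables of single NOPs of lengths 1 to ASM_NOP_MAX.  Entry N of the
 * {k8,p6}_nops pointer arrays refers to an N-byte NOP, for use by
 * add_nops() below.
 */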
#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

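/*
 * NOP table in use.  Defaults to the P6 NOPs and is refined by
 * arch_init_ideal_nops() for the boot CPU.
 */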
static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
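/*
 * Set when the assembler-emitted .nops above match the runtime-selected
 * ideal NOPs, in which case origin-site padding can be left as-is.
 */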
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

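/* Select the NOP table best suited to the boot CPU. */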
static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;
        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;
        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}

/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions.  And on the local CPU you need to be protected against NMI
 * or MCE handlers seeing an inconsistent instruction while you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void *init_or_livepatch noinline
text_poke(void *addr, const void *opcode, size_t len)
{
    return memcpy(addr, opcode, len);
}

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self modifying code.  This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough.  Make sure you disable such features by hand.
 *
 * The caller will set the "force" argument to true for the final
 * invocation, such that no CALLs/JMPs to NULL pointers will be left
 * around.  See also the further comment below.
 */
static void init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end,
                                                  bool force)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end.  A later scanned
     * alternative code can overwrite a previous scanned alternative code.
     * Some kernel functions (e.g. memcpy, memset, etc) use this order to
     * patch code.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;

        BUG_ON(a->repl_len > total_len);
        BUG_ON(total_len > sizeof(buf));
        BUG_ON(a->cpuid >= NCAPINTS * 32);

        /*
         * Detect sequences of alt_instr's patching the same origin site, and
         * keep base pointing at the first alt_instr entry.  This is so we
         * can refer to a single ->priv field for some of our patching
         * decisions, in particular the NOP optimization.  We deliberately
         * use the alt_instr itself rather than a local variable in case we
         * end up making multiple passes.
         *
         * ->priv being nonzero means that the origin site has already been
         * modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
        {
            ASSERT(force);
            continue;
        }

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !boot_cpu_has(a->cpuid) )
        {
            /* Origin site already touched?  Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);

        /* 0xe8/0xe9 are relative branches; fix the offset. */
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
        {
            /*
             * Detect the special case of indirect-to-direct branch patching:
             * - replacement is a direct CALL/JMP (opcodes 0xE8/0xE9; already
             *   checked above),
             * - replacement's displacement is -5 (pointing back at the very
             *   insn, which makes no sense in a real replacement insn),
             * - original is an indirect CALL/JMP (opcodes 0xFF/2 or 0xFF/4)
             *   using RIP-relative addressing.
             * Some branch destinations may still be NULL when we come here
             * the first time.  Defer patching of those until the post-presmp-
             * initcalls re-invocation (with force set to true).  If at that
             * point the branch destination is still NULL, insert "UD2; UD0"
             * (for ease of recognition) instead of CALL/JMP.
             */
            if ( a->cpuid == X86_FEATURE_ALWAYS &&
                 *(int32_t *)(buf + 1) == -5 &&
                 a->orig_len >= 6 &&
                 orig[0] == 0xff &&
                 orig[1] == (*buf & 1 ? 0x25 : 0x15) )
            {
                long disp = *(int32_t *)(orig + 2);
                const uint8_t *dest = *(void **)(orig + 6 + disp);

                if ( dest )
                {
                    disp = dest - (orig + 5);
                    ASSERT(disp == (int32_t)disp);
                    *(int32_t *)(buf + 1) = disp;
                }
                else if ( force )
                {
                    buf[0] = 0x0f;
                    buf[1] = 0x0b;
                    buf[2] = 0x0f;
                    buf[3] = 0xff;
                    buf[4] = 0xff;
                }
                else
                    continue;
            }
            else if ( force && system_state < SYS_STATE_active )
                ASSERT_UNREACHABLE();
            else
                *(int32_t *)(buf + 1) += repl - orig;
        }
        else if ( force && system_state < SYS_STATE_active )
            ASSERT_UNREACHABLE();

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }
}

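/*
 * Entry point for patching a self-contained set of alternatives (e.g. a
 * livepatch payload): always run the forced pass, so no CALLs/JMPs to NULL
 * pointers are left behind.
 */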
void init_or_livepatch apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
{
    _apply_alternatives(start, end, true);
}

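/*
 * Bit 0 represents the boot-time (non-forced) pass, bit 1 the forced pass.
 * alt_todo is the pass currently being requested; alt_done accumulates the
 * passes which have completed.
 */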
static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

/*
 * At boot time, we patch alternatives in NMI context.  This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init nmi_apply_alternatives(const struct cpu_user_regs *regs,
                                         int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() below.
     * We only need to apply alternatives once.
     */
    if ( !(alt_done & alt_todo) )
    {
        unsigned long cr0;

        cr0 = read_cr0();

        /* Disable WP to allow patching read-only pages. */
        write_cr0(cr0 & ~X86_CR0_WP);

        _apply_alternatives(__alt_instructions, __alt_instructions_end,
                            alt_done);

        write_cr0(cr0);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(bool force)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.
     * MCEs only happen when something got corrupted and in this
     * case we must do something about the corruption.
     * Ignoring it is worse than an unlikely patching race.
     * Also machine checks tend to be broadcast and if one CPU
     * goes into machine check the others follow quickly, so we don't
     * expect a machine check to cause undue problems during code
     * patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
    alt_todo = 1u << force;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even an NMI ahead of our explicit self-NMI.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed.  To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);

    /*
     * When Xen is using shadow stacks, the alternatives clearing CR0.WP and
     * writing into the mappings set dirty bits, turning the mappings into
     * shadow stack mappings.
     *
     * While we can execute from them, this would also permit them to be the
     * target of WRSS instructions, so reset the dirty bits after patching.
     */
    if ( cpu_has_xen_shstk )
        modify_xen_mappings(XEN_VIRT_START + MB(2),
                            (unsigned long)&__2M_text_end,
                            PAGE_HYPERVISOR_RX);
}

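/*
 * Boot-time entry point: select the ideal NOPs, then run the first
 * (non-forced) patching pass.
 */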
void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(false);
}

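/*
 * Final, forced pass, run after the pre-SMP initcalls: any branch destination
 * still NULL at this point gets "UD2; UD0" patched in rather than being left
 * unpatched.
 */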
void __init alternative_branches(void)
{
    local_irq_disable();
    _alternative_instructions(true);
    local_irq_enable();
}
