1/*
2 * Copyright (c) 2013-2021, Arm Limited and Contributors. All rights reserved.
3 *
4 * SPDX-License-Identifier: BSD-3-Clause
5 */
6
7#include <arch.h>
8#include <asm_macros.S>
9#include <assert_macros.S>
10#include <common/bl_common.h>
11#include <lib/xlat_tables/xlat_tables_defs.h>
12
13	.globl	smc
14
15	.globl	zero_normalmem
16	.globl	zeromem
17	.globl	memcpy16
18
19	.globl	disable_mmu_el1
20	.globl	disable_mmu_el3
21	.globl	disable_mmu_icache_el1
22	.globl	disable_mmu_icache_el3
23	.globl	fixup_gdt_reloc
24#if SUPPORT_VFP
25	.globl	enable_vfp
26#endif
27
/* -----------------------------------------------------------------------
 * Issue a Secure Monitor Call with immediate 0.
 *
 * No register is written before the SMC instruction, so the handler sees
 * exactly the register contents set up by the caller.
 *
 * There is deliberately no `ret` after the SMC in this routine.
 * NOTE(review): presumably callers never expect control to come back
 * here - confirm against the C prototype.
 * -----------------------------------------------------------------------
 */
func smc
	smc	#0
endfunc smc
31
/* -----------------------------------------------------------------------
 * void zero_normalmem(void *mem, unsigned int length);
 *
 * Zero-fill a region of normal memory. This function complies with the
 * AAPCS and can therefore be called directly from C code.
 *
 * The symbol is a plain alias of zeromem_dczva.
 *
 * NOTE: The MMU must be enabled when calling this function, because it can
 *       only operate on normal memory. It is mainly intended for C code,
 *       which usually runs with the MMU enabled.
 * -----------------------------------------------------------------------
 */
.set	zero_normalmem, zeromem_dczva
44
/* -----------------------------------------------------------------------
 * void zeromem(void *mem, unsigned int length);
 *
 * Zero-fill a region of device memory. This function complies with the
 * AAPCS and can therefore be called directly from C code.
 *
 * In:       x0 = mem, x1 = length in bytes
 * Clobbers: x0, x2, x4 and the condition flags
 *
 * NOTE: When data caches and MMU are enabled, zero_normalmem can usually be
 *       used instead for faster zeroing.
 *
 * -----------------------------------------------------------------------
 */
func zeromem
	/* x2 = mem + length, i.e. the first address that is NOT zeroed */
	add	x2, x1, x0
	/*
	 * Continue in the store-only path of zeromem_dczva. That path never
	 * issues DC ZVA, so it does not depend on the MMU being enabled. It
	 * expects the current address in x0 and the end address in x2, and
	 * returns to our caller.
	 */
	b	.Lzeromem_dczva_fallback_entry
endfunc zeromem
65
/* -----------------------------------------------------------------------
 * void zeromem_dczva(void *mem, unsigned int length);
 *
 * Fill a region of normal memory of size "length" in bytes with null bytes.
 * MMU must be enabled and the memory be of
 * normal type. This is because this function internally uses the DC ZVA
 * instruction, which generates an Alignment fault if used on any type of
 * Device memory (see section D3.4.9 of the ARMv8 ARM, issue k). When the MMU
 * is disabled, all memory behaves like Device-nGnRnE memory (see section
 * D4.2.8), hence the requirement on the MMU being enabled.
 *
 * In:       x0 = mem, x1 = length in bytes
 * Clobbers: x0-x5 and the condition flags
 *
 * NOTE: The code assumes that the block size as defined in DCZID_EL0
 *       register is at least 16 bytes.
 * NOTE: When DCZID_EL0.DZP reports that DC ZVA is prohibited, the whole
 *       region is zeroed with ordinary stores (fallback path) instead.
 *
 * -----------------------------------------------------------------------
 */
func zeromem_dczva

	/*
	 * The function consists of a series of loops that zero memory one byte
	 * at a time, 16 bytes at a time or using the DC ZVA instruction to
	 * zero aligned block of bytes, which is assumed to be more than 16.
	 * In the case where the DC ZVA instruction cannot be used or if the
	 * first 16 bytes loop would overflow, there is fallback path that does
	 * not use DC ZVA.
	 * Note: The fallback path is also used by the zeromem function that
	 *       branches to it directly.
	 *
	 *              +---------+   zeromem_dczva
	 *              |  entry  |
	 *              +----+----+
	 *                   |
	 *                   v
	 *              +---------+
	 *              | checks  |>o-------+ (If any check fails, fallback)
	 *              +----+----+         |
	 *                   |              |---------------+
	 *                   v              | Fallback path |
	 *            +------+------+       |---------------+
	 *            | 1 byte loop |       |
	 *            +------+------+ .Lzeromem_dczva_initial_1byte_aligned_end
	 *                   |              |
	 *                   v              |
	 *           +-------+-------+      |
	 *           | 16 bytes loop |      |
	 *           +-------+-------+      |
	 *                   |              |
	 *                   v              |
	 *            +------+------+ .Lzeromem_dczva_blocksize_aligned
	 *            | DC ZVA loop |       |
	 *            +------+------+       |
	 *       +--------+  |              |
	 *       |        |  |              |
	 *       |        v  v              |
	 *       |   +-------+-------+ .Lzeromem_dczva_final_16bytes_aligned
	 *       |   | 16 bytes loop |      |
	 *       |   +-------+-------+      |
	 *       |           |              |
	 *       |           v              |
	 *       |    +------+------+ .Lzeromem_dczva_final_1byte_aligned
	 *       |    | 1 byte loop |       |
	 *       |    +-------------+       |
	 *       |           |              |
	 *       |           v              |
	 *       |       +---+--+           |
	 *       |       | exit |           |
	 *       |       +------+           |
	 *       |                          |
	 *       |           +--------------+    +------------------+ zeromem
	 *       |           |  +----------------| zeromem function |
	 *       |           |  |                +------------------+
	 *       |           v  v
	 *       |    +-------------+ .Lzeromem_dczva_fallback_entry
	 *       |    | 1 byte loop |
	 *       |    +------+------+
	 *       |           |
	 *       +-----------+
	 */

	/*
	 * Readable names for registers
	 *
	 * Registers x0, x1 and x2 are also set by zeromem which
	 * branches into the fallback path directly, so cursor, length and
	 * stop_address should not be retargeted to other registers.
	 */
	cursor       .req x0 /* Start address and then current address */
	length       .req x1 /* Length in bytes of the region to zero out */
	/* Reusing x1 as length is never used after block_mask is set */
	block_mask   .req x1 /* Bitmask of the block size read in DCZID_EL0 */
	stop_address .req x2 /* Address past the last zeroed byte */
	block_size   .req x3 /* Size of a block in bytes as read in DCZID_EL0 */
	tmp1         .req x4
	tmp2         .req x5

#if ENABLE_ASSERTIONS
	/*
	 * Check for M bit (MMU enabled) of the current SCTLR_EL(1|3)
	 * register value and panic if the MMU is disabled.
	 */
#if defined(IMAGE_BL1) || defined(IMAGE_BL31) || (defined(IMAGE_BL2) && BL2_AT_EL3)
	mrs	tmp1, sctlr_el3
#else
	mrs	tmp1, sctlr_el1
#endif

	tst	tmp1, #SCTLR_M_BIT
	ASM_ASSERT(ne)
#endif /* ENABLE_ASSERTIONS */

	/* stop_address is the address past the last to zero */
	add	stop_address, cursor, length

	/*
	 * Read DCZID_EL0. Its BS field (bits [3:0]) holds log2 of the DC ZVA
	 * block size expressed in 4-byte words, and its DZP field (bit 4)
	 * tells whether DC ZVA may be used at all.
	 */
	mrs	block_size, dczid_el0

	/*
	 * DZP set: DC ZVA is prohibited, so never execute it and zero the
	 * region with plain stores instead. The fallback path only needs
	 * cursor and stop_address, which are both valid at this point.
	 */
	tbnz	block_size, #4, .Lzeromem_dczva_fallback_entry

	/*
	 * Select the 4 lowest bits and convert the extracted log2(<block size
	 * in words>) to <block size in bytes>
	 */
	ubfx	block_size, block_size, #0, #4
	mov	tmp2, #(1 << 2)
	lsl	block_size, tmp2, block_size

#if ENABLE_ASSERTIONS
	/*
	 * Assumes block size is at least 16 bytes to avoid manual realignment
	 * of the cursor at the end of the DCZVA loop.
	 */
	cmp	block_size, #16
	ASM_ASSERT(hs)
#endif
	/*
	 * Not worth doing all the setup for a region less than a block and
	 * protects against zeroing a whole block when the area to zero is
	 * smaller than that. Also, as it is assumed that the block size is at
	 * least 16 bytes, this also protects the initial aligning loops from
	 * trying to zero 16 bytes when length is less than 16.
	 */
	cmp	length, block_size
	b.lo	.Lzeromem_dczva_fallback_entry

	/*
	 * Calculate the bitmask of the block alignment. It will never
	 * underflow as the block size is between 4 bytes and 2kB.
	 * block_mask = block_size - 1
	 */
	sub	block_mask, block_size, #1

	/*
	 * length alias should not be used after this point unless it is
	 * defined as a register other than block_mask's.
	 */
	.unreq	length

	/*
	 * If the start address is already aligned to zero block size, go
	 * straight to the cache zeroing loop. This is safe because at this
	 * point, the length cannot be smaller than a block size.
	 */
	tst	cursor, block_mask
	b.eq	.Lzeromem_dczva_blocksize_aligned

	/*
	 * Calculate the first block-size-aligned address. It is assumed that
	 * the zero block size is at least 16 bytes. This address is the last
	 * address of this initial loop.
	 */
	orr	tmp1, cursor, block_mask
	add	tmp1, tmp1, #1

	/*
	 * If the addition overflows, skip the cache zeroing loops. This is
	 * quite unlikely however.
	 */
	cbz	tmp1, .Lzeromem_dczva_fallback_entry

	/*
	 * If the first block-size-aligned address is past the last address,
	 * fallback to the simpler code.
	 */
	cmp	tmp1, stop_address
	b.hi	.Lzeromem_dczva_fallback_entry

	/*
	 * If the start address is already aligned to 16 bytes, skip this loop.
	 * It is safe to do this because tmp1 (the stop address of the initial
	 * 16 bytes loop) will never be greater than the final stop address.
	 */
	tst	cursor, #0xf
	b.eq	.Lzeromem_dczva_initial_1byte_aligned_end

	/* Calculate the next address aligned to 16 bytes */
	orr	tmp2, cursor, #0xf
	add	tmp2, tmp2, #1
	/* If it overflows, fallback to the simple path (unlikely) */
	cbz	tmp2, .Lzeromem_dczva_fallback_entry
	/*
	 * Next aligned address cannot be after the stop address because the
	 * length cannot be smaller than 16 at this point.
	 */

	/* First loop: zero byte per byte */
1:
	strb	wzr, [cursor], #1
	cmp	cursor, tmp2
	b.ne	1b
.Lzeromem_dczva_initial_1byte_aligned_end:

	/*
	 * Second loop: we need to zero 16 bytes at a time from cursor to tmp1
	 * before being able to use the code that deals with block-size-aligned
	 * addresses.
	 */
	cmp	cursor, tmp1
	b.hs	2f
1:
	stp	xzr, xzr, [cursor], #16
	cmp	cursor, tmp1
	b.lo	1b
2:

	/*
	 * Third loop: zero a block at a time using DC ZVA cache block zeroing
	 * instruction.
	 */
.Lzeromem_dczva_blocksize_aligned:
	/*
	 * Calculate the last block-size-aligned address. If the result equals
	 * to the start address, the loop will exit immediately.
	 */
	bic	tmp1, stop_address, block_mask

	cmp	cursor, tmp1
	b.hs	2f
1:
	/* Zero the block containing the cursor */
	dc	zva, cursor
	/* Increment the cursor by the size of a block */
	add	cursor, cursor, block_size
	cmp	cursor, tmp1
	b.lo	1b
2:

	/*
	 * Fourth loop: zero 16 bytes at a time and then byte per byte the
	 * remaining area
	 */
.Lzeromem_dczva_final_16bytes_aligned:
	/*
	 * Calculate the last 16 bytes aligned address. It is assumed that the
	 * block size will never be smaller than 16 bytes so that the current
	 * cursor is aligned to at least 16 bytes boundary.
	 */
	bic	tmp1, stop_address, #15

	cmp	cursor, tmp1
	b.hs	2f
1:
	stp	xzr, xzr, [cursor], #16
	cmp	cursor, tmp1
	b.lo	1b
2:

	/* Fifth and final loop: zero byte per byte */
.Lzeromem_dczva_final_1byte_aligned:
	cmp	cursor, stop_address
	b.eq	2f
1:
	strb	wzr, [cursor], #1
	cmp	cursor, stop_address
	b.ne	1b
2:
	ret

	/*
	 * Fallback path: store-only zeroing, never issues DC ZVA. Entered
	 * from the checks above and directly from zeromem, with cursor and
	 * stop_address set up.
	 */
.Lzeromem_dczva_fallback_entry:
	/*
	 * If the start address is already aligned to 16 bytes, skip this loop.
	 */
	tst	cursor, #0xf
	b.eq	.Lzeromem_dczva_final_16bytes_aligned

	/* Calculate the next address aligned to 16 bytes */
	orr	tmp1, cursor, #15
	add	tmp1, tmp1, #1
	/* If it overflows, fallback to byte per byte zeroing */
	cbz	tmp1, .Lzeromem_dczva_final_1byte_aligned
	/* If the next aligned address is after the stop address, fall back */
	cmp	tmp1, stop_address
	b.hs	.Lzeromem_dczva_final_1byte_aligned

	/* Fallback entry loop: zero byte per byte */
1:
	strb	wzr, [cursor], #1
	cmp	cursor, tmp1
	b.ne	1b

	b	.Lzeromem_dczva_final_16bytes_aligned

	.unreq	cursor
	/*
	 * length is already unreq'ed to reuse the register for another
	 * variable.
	 */
	.unreq	stop_address
	.unreq	block_size
	.unreq	block_mask
	.unreq	tmp1
	.unreq	tmp2
endfunc zeromem_dczva
379
/* --------------------------------------------------------------------------
 * void memcpy16(void *dest, const void *src, unsigned int length)
 *
 * Copy length bytes from memory area src to memory area dest.
 * The memory areas should not overlap.
 * Destination and source addresses must be 16-byte aligned.
 *
 * In:       x0 = dest, x1 = src, x2 = length in bytes
 * Clobbers: x0-x4 and the condition flags
 *
 * NOTE(review): the whole of x2 is used as the byte count although the
 * prototype above says "unsigned int" - confirm that callers pass a value
 * whose upper 32 bits are zero.
 * --------------------------------------------------------------------------
 */
func memcpy16
#if ENABLE_ASSERTIONS
	/* OR both pointers so that one test covers the alignment of each */
	orr	x3, x0, x1
	tst	x3, #0xf
	ASM_ASSERT(eq)
#endif
	/* Bulk loop: copy one 16-byte pair for as long as >= 16 bytes remain */
.Lmemcpy16_loop16:
	cmp	x2, #16
	b.lo	.Lmemcpy16_tail
	ldp	x3, x4, [x1], #16
	stp	x3, x4, [x0], #16
	sub	x2, x2, #16
	b	.Lmemcpy16_loop16

	/* Tail: fewer than 16 bytes are left; nothing to do if that is zero */
.Lmemcpy16_tail:
	cbz	x2, .Lmemcpy16_end
	/* Byte loop: x2 is non-zero on entry, so test it after each copy */
.Lmemcpy16_loop1:
	ldrb	w3, [x1], #1
	strb	w3, [x0], #1
	subs	x2, x2, #1
	b.ne	.Lmemcpy16_loop1
.Lmemcpy16_end:
	ret
endfunc memcpy16
412
/* ---------------------------------------------------------------------------
 * Disable the MMU at EL3
 *
 * disable_mmu_el3        - clears SCTLR_M_BIT and SCTLR_C_BIT in SCTLR_EL3
 * disable_mmu_icache_el3 - as above, and also clears SCTLR_I_BIT
 *
 * Neither entry point takes an argument. Both overwrite x0 and x1.
 * ---------------------------------------------------------------------------
 */

func disable_mmu_el3
	/* x1 = set of SCTLR_EL3 bits to clear */
	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT)

	/* Shared tail, also entered from disable_mmu_icache_el3 with x1 set */
do_disable_mmu_el3:
	mrs	x0, sctlr_el3
	bic	x0, x0, x1		/* drop every bit selected by x1 */
	msr	sctlr_el3, x0
	isb	/* ensure MMU is off */
	dsb	sy
	ret
endfunc disable_mmu_el3


func disable_mmu_icache_el3
	/* Same as disable_mmu_el3, with the I bit added to the clear mask */
	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
	b	do_disable_mmu_el3
endfunc disable_mmu_icache_el3
434
/* ---------------------------------------------------------------------------
 * Disable the MMU at EL1
 *
 * disable_mmu_el1        - clears SCTLR_M_BIT and SCTLR_C_BIT in SCTLR_EL1
 * disable_mmu_icache_el1 - as above, and also clears SCTLR_I_BIT
 *
 * Neither entry point takes an argument. Both overwrite x0 and x1.
 * ---------------------------------------------------------------------------
 */

func disable_mmu_el1
	/* x1 = set of SCTLR_EL1 bits to clear */
	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT)

	/* Shared tail, also entered from disable_mmu_icache_el1 with x1 set */
do_disable_mmu_el1:
	mrs	x0, sctlr_el1
	bic	x0, x0, x1		/* drop every bit selected by x1 */
	msr	sctlr_el1, x0
	isb	/* ensure MMU is off */
	dsb	sy
	ret
endfunc disable_mmu_el1


func disable_mmu_icache_el1
	/* Same as disable_mmu_el1, with the I bit added to the clear mask */
	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
	b	do_disable_mmu_el1
endfunc disable_mmu_icache_el1
456
/* ---------------------------------------------------------------------------
 * Enable the use of VFP at EL3
 *
 * Sets CPACR_VFP_BITS in CPACR_EL1 and clears AARCH64_CPTR_TFP in CPTR_EL3.
 * A single ISB after the second write synchronises both updates.
 *
 * Takes no argument. Overwrites x0 and x1.
 * Only assembled when SUPPORT_VFP is non-zero.
 * ---------------------------------------------------------------------------
 */
#if SUPPORT_VFP
func enable_vfp
	/* CPACR_EL1 |= CPACR_VFP_BITS */
	mrs	x0, cpacr_el1
	orr	x0, x0, #CPACR_VFP_BITS
	msr	cpacr_el1, x0

	/* CPTR_EL3 &= ~AARCH64_CPTR_TFP */
	mrs	x0, cptr_el3
	mov	x1, #AARCH64_CPTR_TFP
	bic	x0, x0, x1
	msr	cptr_el3, x0

	/* Make both system register writes take effect before returning */
	isb
	ret
endfunc enable_vfp
#endif
474
/* ---------------------------------------------------------------------------
 * Helper to fixup the Global Offset Table (GOT) and dynamic relocations
 * (.rela.dyn) at runtime.
 *
 * This function is meant to be used when the firmware is compiled with -fpie
 * and linked with -pie options. We rely on the linker script exporting
 * appropriate markers for start and end of the section. For GOT, we
 * expect __GOT_START__ and __GOT_END__. Similarly for .rela.dyn, we expect
 * __RELA_START__ and __RELA_END__. Either section may be empty (start marker
 * equal to end marker), in which case it is skipped.
 *
 * The function takes the limits of the memory to apply fixups to as
 * arguments (which is usually the limits of the relocable BL image).
 *   x0 -  the start of the fixup region
 *   x1 -  the limit of the fixup region
 * These addresses have to be 4KB page aligned.
 *
 * Clobbers: x0-x4, x6, x7 and the condition flags.
 * ---------------------------------------------------------------------------
 */

/* Relocation codes */
#define	R_AARCH64_NONE		0
#define	R_AARCH64_RELATIVE	1027

func fixup_gdt_reloc
	/* Keep the region limits: x0 and x1 are reused as scratch below */
	mov	x6, x0
	mov	x7, x1

#if ENABLE_ASSERTIONS
	/* Test if the limits are 4KB aligned */
	orr	x0, x0, x1
	tst	x0, #(PAGE_SIZE_MASK)
	ASM_ASSERT(eq)
#endif
	/*
	 * Calculate the offset based on return address in x30.
	 * Assume that this function is called within a page at the start of
	 * fixup region.
	 */
	and	x2, x30, #~(PAGE_SIZE_MASK)
	subs	x0, x2, x6	/* Diff(S) = Current Address - Compiled Address */
	/*
	 * Diff(S) = 0: the GOT entries need no adjustment, so skip the GOT
	 * pass. The .rela.dyn pass below is still executed.
	 */
	b.eq	.Lfixup_rela

	adrp	x1, __GOT_START__
	add	x1, x1, :lo12:__GOT_START__
	adrp	x2, __GOT_END__
	add	x2, x2, :lo12:__GOT_END__

	/* An empty GOT has no entry to read: do not touch what follows it */
	cmp	x1, x2
	b.hs	.Lfixup_rela

	/*
	 * GOT is an array of 64_bit addresses which must be fixed up as
	 * new_addr = old_addr + Diff(S).
	 * The new_addr is the address currently the binary is executing from
	 * and old_addr is the address at compile time.
	 */
.Lfixup_got_loop:
	ldr	x3, [x1]

	/* Skip adding offset if address is < lower limit */
	cmp	x3, x6
	b.lo	.Lfixup_got_next

	/* Skip adding offset if address is >= upper limit */
	cmp	x3, x7
	b.hs	.Lfixup_got_next
	add	x3, x3, x0
	str	x3, [x1]

.Lfixup_got_next:
	add	x1, x1, #8
	cmp	x1, x2
	b.lo	.Lfixup_got_loop

	/* Starting dynamic relocations. Use adrp/adr to get RELA_START and END */
.Lfixup_rela:
	adrp	x1, __RELA_START__
	add	x1, x1, :lo12:__RELA_START__
	adrp	x2, __RELA_END__
	add	x2, x2, :lo12:__RELA_END__

	/* An empty .rela.dyn has no entry to read: nothing left to do */
	cmp	x1, x2
	b.hs	.Lfixup_done

	/*
	 * According to ELF-64 specification, the RELA data structure is as
	 * follows:
	 *	typedef struct {
	 *		Elf64_Addr r_offset;
	 *		Elf64_Xword r_info;
	 *		Elf64_Sxword r_addend;
	 *	} Elf64_Rela;
	 *
	 * r_offset is address of reference
	 * r_info is symbol index and type of relocation (in this case
	 * code 1027 which corresponds to R_AARCH64_RELATIVE).
	 * r_addend is constant part of expression.
	 *
	 * Size of Elf64_Rela structure is 24 bytes.
	 */

.Lfixup_rela_loop:
	/* Skip R_AARCH64_NONE entry with code 0 */
	ldr	x3, [x1, #8]	/* r_info */
	cbz	x3, .Lfixup_rela_next

#if ENABLE_ASSERTIONS
	/* Assert that the relocation type is R_AARCH64_RELATIVE */
	cmp	x3, #R_AARCH64_RELATIVE
	ASM_ASSERT(eq)
#endif
	ldr	x3, [x1]	/* r_offset */
	add	x3, x0, x3	/* run-time address of the word to patch */
	ldr	x4, [x1, #16]	/* r_addend */

	/* Skip adding offset if r_addend is < lower limit */
	cmp	x4, x6
	b.lo	.Lfixup_rela_next

	/* Skip adding offset if r_addend entry is >= upper limit */
	cmp	x4, x7
	b.hs	.Lfixup_rela_next

	add	x4, x0, x4	/* Diff(S) + r_addend */
	str	x4, [x3]

.Lfixup_rela_next:
	add	x1, x1, #24
	cmp	x1, x2
	b.lo	.Lfixup_rela_loop

.Lfixup_done:
	ret
endfunc fixup_gdt_reloc
595