1 // SPDX-License-Identifier: BSD-2-Clause
2 /*
3  * Copyright (c) 2020, Huawei Technologies Co., Ltd
4  */
5 /*
6  * Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64.
7  *
 * TAs are currently single-threaded, so the only benefit of implementing these
 * ABIs is to support toolchains that need them even when the target program is
 * single-threaded. One example is the g++ compiler from the GCC toolchain
 * targeting a "Posix thread" Linux runtime, which OP-TEE has been using for
 * quite some time (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows
 * building C++ TAs without having to build a specific toolchain with
 * --disable-threads.
14  *
15  * This implementation is based on [1].
16  *
17  *  - "TLS data structures variant 1" (section 3): the AArch64 compiler uses the
18  *    TPIDR_EL0 to access TLS data directly. This assumes a specific layout for
19  *    the TCB, and (for shared objects) the use of R_AARCH64_TLS_TPREL
20  *    relocations.
21  *  - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32
22  *    compiler inserts calls to the __tls_get_addr() function which has to be
23  *    implemented by the runtime library. The function takes a module ID and an
24  *    offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and
25  *    R_ARM_TLS_DTPOFF32 relocations.
26  *
27  * In addition, dl_iterate_phdr() is implemented here, because it is used by the
28  * g++ Aarch64 exception handling and it does use the TCB to provide TLS
29  * information to the caller.
30  *
31  * [1] "ELF Handling For Thread-Local Storage"
32  *     https://www.akkadia.org/drepper/tls.pdf
33  */
34 
35 #include <arm64_user_sysreg.h>
36 #include <assert.h>
37 #include <link.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/queue.h>
41 #include "user_ta_header.h"
42 
/* DTV - Dynamic Thread Vector
 *
 * Maintains an array of pointers to TLS data for each module in the TCB. Each
 * module that has a TLS segment has an entry (and consequently, some space in
 * the tcb_head::tls buffer). The index is the "module ID".
 * dtv[0].size is the number of elements in the vector
 * dtv[1].tls points to TLS for the main executable (may be NULL)
 * dtv[2 .. (size-1)].tls are for shared libraries
 */
union dtv {
	/* Meaningful for element 0 only: number of elements in the vector */
	unsigned long size;
	/* Meaningful for elements 1 and up: start of the module's TLS block */
	uint8_t *tls;
};
56 
/*
 * Allocation size in bytes for a DTV: one union dtv element plus @size
 * additional bytes. Callers pass the byte size of the elements they need.
 */
#define DTV_SIZE(size) (sizeof(union dtv) + (size))
58 
/* Thread Control Block */
struct tcb_head {
	/* Two words are reserved as per the "TLS variant 1" ABI */
	/* First reserved word: pointer to the Dynamic Thread Vector */
	union dtv *dtv;
	/* Second reserved word: not accessed by the code visible in this file */
	unsigned long reserved;
	/*
	 * The rest of the structure contains the TLS blocks for each ELF module
	 * having a PT_TLS segment. Each block is a copy of the .tdata section
	 * plus some zero-initialized space for .tbss.
	 */
	uint8_t tls[];
};
71 
/*
 * Since TAs are single threaded, only one TCB is needed. This would need to
 * change if multi-threading is introduced.
 */
/* The single TCB; NULL until __utee_tcb_init() allocates it */
static struct tcb_head *_tcb;
/* Total size in bytes of the TLS blocks currently stored in _tcb->tls */
static size_t _tls_size;

/* Allocation size in bytes for a TCB carrying @tls_size bytes of TLS data */
#define TCB_SIZE(tls_size) (sizeof(*_tcb) + (tls_size))
80 
81 /*
82  * Initialize or update the TCB.
83  * Called on application initialization and when additional shared objects are
84  * loaded via dlopen().
85  */
__utee_tcb_init(void)86 void __utee_tcb_init(void)
87 {
88 	struct dl_phdr_info *dlpi = NULL;
89 	const Elf_Phdr *phdr = NULL;
90 	size_t total_size = 0;
91 	size_t size = 0;
92 	size_t i = 0;
93 	size_t j = 0;
94 
95 	/* Compute the size needed for all the TLS blocks */
96 	for (i = 0; i < __elf_phdr_info.count; i++) {
97 		dlpi = __elf_phdr_info.dlpi + i;
98 		for (j = 0; j < dlpi->dlpi_phnum; j++) {
99 			phdr = dlpi->dlpi_phdr + j;
100 			if (phdr->p_type == PT_TLS) {
101 				total_size += phdr->p_memsz;
102 				break;
103 			}
104 		}
105 	}
106 
107 	/* ELF modules currently cannot be unmapped */
108 	assert(total_size >= _tls_size);
109 
110 	if (total_size == _tls_size)
111 		return;
112 
113 	/* (Re-)allocate the TCB */
114 	_tcb = realloc(_tcb, TCB_SIZE(total_size));
115 	if (!_tcb) {
116 		EMSG("TCB allocation failed (%zu bytes)", TCB_SIZE(total_size));
117 		abort();
118 	}
119 
120 	/* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */
121 	size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv));
122 	_tcb->dtv = realloc(_tcb->dtv, size);
123 	if (!_tcb->dtv) {
124 		EMSG("DTV allocation failed (%zu bytes)", size);
125 		abort();
126 	}
127 
128 	/* Copy TLS data to the TCB */
129 	size = 0;
130 	for (i = 0; i < __elf_phdr_info.count; i++) {
131 		dlpi = __elf_phdr_info.dlpi + i;
132 		for (j = 0; j < dlpi->dlpi_phnum; j++) {
133 			phdr = dlpi->dlpi_phdr + j;
134 			if (phdr->p_type != PT_TLS)
135 				continue;
136 			if (size + phdr->p_memsz <= _tls_size) {
137 				/* Already copied */
138 				break;
139 			}
140 			_tcb->dtv[i + 1].tls = _tcb->tls + size;
141 			/* Copy .tdata */
142 			memcpy(_tcb->tls + size,
143 			       (void *)(dlpi->dlpi_addr + phdr->p_vaddr),
144 			       phdr->p_filesz);
145 			/* Initialize .tbss */
146 			memset(_tcb->tls + size + phdr->p_filesz, 0,
147 			       phdr->p_memsz - phdr->p_filesz);
148 			size += phdr->p_memsz;
149 		}
150 	}
151 	_tcb->dtv[0].size = i;
152 
153 	_tls_size = total_size;
154 #ifdef ARM64
155 	/*
156 	 * Aarch64 ABI requirement: the thread pointer shall point to the
157 	 * thread's TCB. ARMv7 and Aarch32 access the TCB via _tls_get_addr().
158 	 */
159 	write_tpidr_el0((vaddr_t)_tcb);
160 #endif
161 }
162 
/*
 * Argument of __tls_get_addr(): designates a TLS variable by module and by
 * offset inside that module's TLS block.
 */
struct tls_index {
	/* Module ID, used as an index into the DTV */
	unsigned long module;
	/* Byte offset added to the start of the module's TLS block */
	unsigned long offset;
};
167 
168 void *__tls_get_addr(struct tls_index *ti);
169 
__tls_get_addr(struct tls_index * ti)170 void *__tls_get_addr(struct tls_index *ti)
171 {
172 	return _tcb->dtv[ti->module].tls + ti->offset;
173 }
174 
dl_iterate_phdr(int (* callback)(struct dl_phdr_info *,size_t,void *),void * data)175 int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
176 		    void *data)
177 {
178 	struct dl_phdr_info *dlpi = NULL;
179 	size_t id = 0;
180 	size_t i = 0;
181 	int st = 0;
182 
183 	/*
184 	 * dlpi_tls_data is thread-specific so if we were to support
185 	 * multi-threading, we would need one copy of struct dl_phdr_info per
186 	 * thread. Could be a pre-allocated area, or could be allocated on the
187 	 * heap. Doing the latter here so that it would at least work if/when we
188 	 * add thread support. Further optimization can always come later.
189 	 */
190 	dlpi = calloc(1, sizeof(*dlpi));
191 	if (!dlpi) {
192 		EMSG("dl_phdr_info allocation failed");
193 		abort();
194 	}
195 
196 	for (i = 0; i < __elf_phdr_info.count; i++) {
197 		memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi));
198 		dlpi->dlpi_tls_data = NULL;
199 		id = dlpi->dlpi_tls_modid;
200 		if (id)
201 			dlpi->dlpi_tls_data = _tcb->dtv[id].tls;
202 		st = callback(dlpi, sizeof(*dlpi), data);
203 	}
204 
205 	free(dlpi);
206 	return st;
207 }
208