1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
|
// SPDX-License-Identifier: BSD-2-Clause
/*
* Copyright (c) 2020, Huawei Technologies Co., Ltd
*/
/*
* Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64.
*
* TAs are currently single-threaded, so the only benefit of implementing these
* ABIs is to support toolchains that need them even when the target program is
* single-threaded, such as the g++ compiler from the GCC toolchain targeting a
* "POSIX thread" Linux runtime, which OP-TEE has been using for quite some time
* (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows building C++ TAs
* without having to build a specific toolchain with --disable-threads.
*
* This implementation is based on [1].
*
* - "TLS data structures variant 1" (section 3): the AArch64 compiler uses the
* TPIDR_EL0 to access TLS data directly. This assumes a specific layout for
* the TCB, and (for shared objects) the use of R_AARCH64_TLS_TPREL
* relocations.
* - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32
* compiler inserts calls to the __tls_get_addr() function which has to be
* implemented by the runtime library. The function takes a module ID and an
* offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and
* R_ARM_TLS_DTPOFF32 relocations.
*
* In addition, dl_iterate_phdr() is implemented here, because it is used by the
* g++ Aarch64 exception handling and it does use the TCB to provide TLS
* information to the caller.
*
* [1] "ELF Handling For Thread-Local Storage"
* https://www.akkadia.org/drepper/tls.pdf
*/
#include <arm64_user_sysreg.h>
#include <assert.h>
#include <link.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>
#include "user_ta_header.h"
/* DTV - Dynamic Thread Vector
*
* Maintains an array of pointers to TLS data for each module in the TCB. Each
* module that has a TLS segment has an entry (and consequently, some space in
* the tcb_head::tls buffer). The index is the "module ID".
* dtv[0].size is the number of modules tracked by the vector
* dtv[1].tls points to TLS for the main executable (may be NULL)
* dtv[2 .. size].tls are for shared libraries
*/
union dtv {
unsigned long size;	/* Used in dtv[0] only: number of modules */
uint8_t *tls;		/* dtv[1..]: module's TLS block inside the TCB */
};
/*
* Byte size of a DTV whose entries occupy @size bytes.
* NOTE(review): this adds one extra union dtv on top of @size; the caller
* already accounts for dtv[0] with its own "+ 1", so this looks like a
* harmless one-entry over-allocation — confirm intent.
*/
#define DTV_SIZE(size) (sizeof(union dtv) + (size))
/* Thread Control Block */
struct tcb_head {
/* Two words are reserved as per the "TLS variant 1" ABI */
union dtv *dtv;		/* Dynamic Thread Vector (heap-allocated) */
unsigned long reserved;
/*
* The rest of the structure contains the TLS blocks for each ELF module
* having a PT_TLS segment. Each block is a copy of the .tdata section
* plus some zero-initialized space for .tbss.
*/
uint8_t tls[];		/* Flexible array member; size tracked in _tls_size */
};
/*
* Since TAs are single threaded, only one TCB is needed. This would need to
* change if multi-threading is introduced.
*/
static struct tcb_head *_tcb;
/* Total bytes of TLS data currently held in _tcb->tls (sum of p_memsz) */
static size_t _tls_size;
/* Allocation size for a TCB carrying @tls_size bytes of TLS data */
#define TCB_SIZE(tls_size) (sizeof(*_tcb) + (tls_size))
/*
 * Initialize or update the TCB.
 * Called on application initialization and when additional shared objects are
 * loaded via dlopen().
 */
void __utee_tcb_init(void)
{
	struct dl_phdr_info *dlpi = NULL;
	const Elf_Phdr *phdr = NULL;
	size_t total_size = 0;
	size_t size = 0;
	size_t i = 0;
	size_t j = 0;

	/* Compute the size needed for all the TLS blocks */
	for (i = 0; i < __elf_phdr_info.count; i++) {
		dlpi = __elf_phdr_info.dlpi + i;
		for (j = 0; j < dlpi->dlpi_phnum; j++) {
			phdr = dlpi->dlpi_phdr + j;
			if (phdr->p_type == PT_TLS) {
				total_size += phdr->p_memsz;
				break;
			}
		}
	}
	/* ELF modules currently cannot be unmapped */
	assert(total_size >= _tls_size);
	if (total_size == _tls_size)
		return;
	/*
	 * (Re-)allocate the TCB. The realloc-like semantics preserve TLS
	 * blocks already copied for previously loaded modules, but the buffer
	 * may move: every dtv[].tls pointer is re-based in the loop below.
	 */
	_tcb = malloc_flags(MAF_ZERO_INIT, _tcb, 1, TCB_SIZE(total_size));
	if (!_tcb) {
		EMSG("TCB allocation failed (%zu bytes)", TCB_SIZE(total_size));
		abort();
	}
	/* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */
	size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv));
	_tcb->dtv = malloc_flags(MAF_ZERO_INIT, _tcb->dtv, 1, size);
	if (!_tcb->dtv) {
		EMSG("DTV allocation failed (%zu bytes)", size);
		abort();
	}
	/* Copy TLS data to the TCB and (re-)populate the DTV */
	size = 0;
	for (i = 0; i < __elf_phdr_info.count; i++) {
		dlpi = __elf_phdr_info.dlpi + i;
		for (j = 0; j < dlpi->dlpi_phnum; j++) {
			phdr = dlpi->dlpi_phdr + j;
			if (phdr->p_type != PT_TLS)
				continue;
			/*
			 * Always (re-)set the pointer: the TCB may have moved
			 * when it was reallocated above, which would leave
			 * previously stored pointers dangling.
			 */
			_tcb->dtv[i + 1].tls = _tcb->tls + size;
			if (size + phdr->p_memsz > _tls_size) {
				/* New module: copy .tdata... */
				memcpy(_tcb->tls + size,
				       (void *)(dlpi->dlpi_addr +
						phdr->p_vaddr),
				       phdr->p_filesz);
				/* ...and zero-initialize .tbss */
				memset(_tcb->tls + size + phdr->p_filesz, 0,
				       phdr->p_memsz - phdr->p_filesz);
			}
			/*
			 * Advance past this block even when it was already
			 * copied, so that a newly loaded module lands after
			 * the existing data instead of overwriting it (and is
			 * not itself misdetected as already copied).
			 */
			size += phdr->p_memsz;
			break;
		}
	}
	/* Number of modules tracked by the DTV (entries 1 .. size) */
	_tcb->dtv[0].size = i;
	_tls_size = total_size;
#ifdef ARM64
	/*
	 * Aarch64 ABI requirement: the thread pointer shall point to the
	 * thread's TCB. ARMv7 and Aarch32 access the TCB via
	 * __tls_get_addr().
	 */
	write_tpidr_el0((vaddr_t)_tcb);
#endif
}
/*
 * Argument to __tls_get_addr(), filled in by the static linker / relocations
 * (R_ARM_TLS_DTPMOD32 and R_ARM_TLS_DTPOFF32 on ARMv7/Aarch32)
 */
struct tls_index {
unsigned long module;	/* Module ID: index into the DTV, starts at 1 */
unsigned long offset;	/* Byte offset into that module's TLS block */
};
void *__tls_get_addr(struct tls_index *ti);

/*
 * "General Dynamic" TLS access ([1] section 4.1): resolve a (module ID,
 * offset) pair into the address of a TLS variable. The DTV maps the module
 * ID to the module's TLS block inside the single, process-wide TCB.
 */
void *__tls_get_addr(struct tls_index *ti)
{
	uint8_t *tls_block = _tcb->dtv[ti->module].tls;

	return tls_block + ti->offset;
}
/*
 * Walk the list of loaded ELF modules, invoking @callback for each one with a
 * populated struct dl_phdr_info (including per-module TLS data from the DTV).
 *
 * Per the dl_iterate_phdr() contract, iteration stops as soon as the callback
 * returns a non-zero value; that value is returned to the caller. Returns 0
 * when every callback returned 0 (or when there are no modules).
 */
int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
		    void *data)
{
	struct dl_phdr_info *dlpi = NULL;
	size_t id = 0;
	size_t i = 0;
	int st = 0;

	/*
	 * dlpi_tls_data is thread-specific so if we were to support
	 * multi-threading, we would need one copy of struct dl_phdr_info per
	 * thread. Could be a pre-allocated area, or could be allocated on the
	 * heap. Doing the latter here so that it would at least work if/when
	 * we add thread support. Further optimization can always come later.
	 */
	dlpi = calloc(1, sizeof(*dlpi));
	if (!dlpi) {
		EMSG("dl_phdr_info allocation failed");
		abort();
	}
	for (i = 0; i < __elf_phdr_info.count; i++) {
		memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi));
		/* Expose this module's TLS block, if it has one */
		dlpi->dlpi_tls_data = NULL;
		id = dlpi->dlpi_tls_modid;
		if (id)
			dlpi->dlpi_tls_data = _tcb->dtv[id].tls;
		st = callback(dlpi, sizeof(*dlpi), data);
		/* Non-zero return from the callback stops the iteration */
		if (st)
			break;
	}
	free(dlpi);
	return st;
}
|