This patch implements all necessary changes to support static
TLS - both local-exec and initial-exec - for the aarch64 port. Please note
that this effectively allows apps using TLS, along with any
'initial-exec'-loaded libraries (NOT dlopen()-ed ones), to run on OSv.
This includes all TLS-related unit tests.

Overall, the code changes mimic their x64 static-TLS counterparts,
except that the layout of the static TLS conforms to variant I
as described in Drepper's paper - the blocks are laid out from left
to right and the offsets from TP are positive.
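
As a rough illustration (a sketch only, not part of the patch - the
variable name is made up), with variant I a thread-local variable ends up
at a positive offset from the thread pointer, which is exactly the value
the relocation code below computes:

  // illustrative C++ only
  __thread int counter;
  // &counter == TP + sizeof(thread_control_block)
  //                + <offset of this object's TLS block> + <st_value of counter>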

In addition, TLS support on aarch64 specifically requires
handling of so-called TLS descriptors, which in essence
add an extra level of indirection - resolver functions that,
when called, return the TP offsets of TLS variables. Besides that,
we also need to handle the R_AARCH64_TLS_TPREL64 relocations. For
more details please see
https://www.fsfla.org/~lxoliva/writeups/TLS/RFC-TLSDESC-ARM.txt.
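
Conceptually (a sketch only, not code taken from this patch), a TLSDESC
GOT entry is a pair of words: arch_relocate_tls_desc() below fills the
first with the resolver (__tlsdesc_static) and the second with the
TP-relative offset, and the resolver simply returns that second word:

  // sketch of the descriptor layout assumed by arch_relocate_tls_desc()
  // and arch/aarch64/tlsdesc.s
  struct tls_descriptor {
      size_t (*resolver)(size_t *self); // __tlsdesc_static for static TLS
      size_t  tp_offset;                // resolver's return value
  };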

This version of the patch cleans up some debugging code
that was part of the original patch.

Fixes #1101

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 Makefile                    |  3 +-
 arch/aarch64/arch-elf.cc    | 79 ++++++++++++++++++++++++++++++++++--
 arch/aarch64/arch-elf.hh    |  1 +
 arch/aarch64/arch-switch.hh | 81 +++++++++++++++++++++++++++++++++++--
 arch/aarch64/loader.ld      |  9 +++--
 arch/aarch64/tlsdesc.s      | 10 +++++
 arch/x64/arch-elf.cc        |  5 +++
 arch/x64/arch-elf.hh        |  2 +
 arch/x64/arch-switch.hh     |  4 ++
 core/elf.cc                 | 44 +++++++++++---------
 include/osv/elf.hh          |  4 ++
 scripts/test.py             |  1 -
 12 files changed, 211 insertions(+), 32 deletions(-)
 create mode 100644 arch/aarch64/tlsdesc.s

diff --git a/Makefile b/Makefile
index 41a9776c..21127c3f 100644
--- a/Makefile
+++ b/Makefile
@@ -456,7 +456,7 @@ ifeq ($(arch),aarch64)
 
 kernel_base := 0x40080000
 kernel_vm_base := 0x40080000
-app_local_exec_tls_size := 0x0
+app_local_exec_tls_size := 0x40
 
 include $(libfdt_base)/Makefile.libfdt
 libfdt-source := $(patsubst %.c, $(libfdt_base)/%.c, $(LIBFDT_SRCS))
@@ -866,6 +866,7 @@ objects += arch/$(arch)/hypercall.o
 objects += arch/$(arch)/memset.o
 objects += arch/$(arch)/memcpy.o
 objects += arch/$(arch)/memmove.o
+objects += arch/$(arch)/tlsdesc.o
 objects += $(libfdt)
 endif
 
diff --git a/arch/aarch64/arch-elf.cc b/arch/aarch64/arch-elf.cc
index 72be11d7..5837a59b 100644
--- a/arch/aarch64/arch-elf.cc
+++ b/arch/aarch64/arch-elf.cc
@@ -6,7 +6,9 @@
  */
 
 #include <osv/elf.hh>
+#include <osv/sched.hh>
 
+extern "C" size_t __tlsdesc_static(size_t *);
 namespace elf {
 
 bool arch_init_reloc_dyn(struct init_table *t, u32 type, u32 sym,
@@ -50,7 +52,30 @@ bool object::arch_relocate_rela(u32 type, u32 sym, void *addr,
         *static_cast<void**>(addr) = _base + addend;
         break;
     case R_AARCH64_TLS_TPREL64:
-        *static_cast<void**>(addr) = symbol(sym).relocated_addr() + addend;
+        if (sym) {
+            auto sm = symbol(sym);
+            ulong tls_offset;
+            if (sm.obj->is_executable()) {
+                // If this is an executable (pie or position-dependent one),
+                // then the variable is located in the reserved slot of the TLS,
+                // right where the kernel TLS lives.
+                // So the offset is 0 - right at the start of the static TLS.
+                tls_offset = 0;
+            } else {
+                // If it is a shared library, the variable is located in one of the TLS
+                // blocks that are part of the static TLS after the kernel part,
+                // so the offset needs to be shifted by the sum of the kernel TLS size
+                // and the size of the user static TLS so far
+                sm.obj->alloc_static_tls();
+                tls_offset = sm.obj->static_tls_offset() + sched::kernel_tls_size();
+            }
+            *static_cast<u64*>(addr) = sm.symbol->st_value + addend + tls_offset + sizeof(thread_control_block);
+        }
+        else {
+            alloc_static_tls();
+            ulong tls_offset = _static_tls_offset + sched::kernel_tls_size();
+            *static_cast<u64*>(addr) = addend + tls_offset + sizeof(thread_control_block);
+        }
         break;
     default:
         return false;
@@ -69,15 +94,63 @@ bool object::arch_relocate_jump_slot(symbol_module& sym, void *addr, Elf64_Sxwor
     }
 }
 
+void object::arch_relocate_tls_desc(symbol_module& sym, void *addr, Elf64_Sxword addend)
+{
+    //TODO: Differentiate between DT_NEEDED (static TLS, initial-exec) and dynamic TLS (dlopen)
+    //For now assume it is always static TLS case
+    //
+    // First place the address of the resolver function - __tlsdesc_static
+    *static_cast<size_t*>(addr) = (size_t)__tlsdesc_static;
+    // Secondly, calculate and store the argument passed to the resolver function - the TLS offset
+    ulong tls_offset;
+    if (sym.obj->is_executable()) {
+        // If this is an executable (pie or position-dependent one),
+        // then the variable is located in the reserved slot of the TLS,
+        // right where the kernel TLS lives.
+        // So the offset is 0 - right at the start of the static TLS.
+        tls_offset = 0;
+    } else {
+        // If it is a shared library, the variable is located in one of the TLS
+        // blocks that are part of the static TLS after the kernel part,
+        // so the offset needs to be shifted by the sum of the kernel TLS size
+        // and the size of the user static TLS so far
+        sym.obj->alloc_static_tls();
+        tls_offset = sym.obj->static_tls_offset() + sched::kernel_tls_size();
+    }
+    auto offset = (size_t)sym.symbol->st_value + addend + tls_offset + sizeof(thread_control_block);
+    *(static_cast<size_t*>(addr) + 1) = offset;
+}
+
 void object::prepare_initial_tls(void* buffer, size_t size,
                                  std::vector<ptrdiff_t>& offsets)
 {
-    abort();
+    if (!_static_tls) {
+        return;
+    }
+
+    auto offset = _static_tls_offset;
+    auto ptr = static_cast<char*>(buffer) + offset;
+    memcpy(ptr, _tls_segment, _tls_init_size);
+    memset(ptr + _tls_init_size, 0, _tls_uninit_size);
+
+    offsets.resize(std::max(_module_index + 1, offsets.size()));
+    offsets[_module_index] = offset;
 }
 
 void object::prepare_local_tls(std::vector<ptrdiff_t>& offsets)
 {
-    abort();
+    if (!_static_tls && !is_executable()) {
+        return;
+    }
+
+    offsets.resize(std::max(_module_index + 1, offsets.size()));
+    offsets[_module_index] = 0;
+}
+
+void object::copy_local_tls(void* to_addr)
+{
+    memcpy(to_addr, _tls_segment, _tls_init_size);
+    memset(to_addr + _tls_init_size, 0, _tls_uninit_size);
 }
 
 }
diff --git a/arch/aarch64/arch-elf.hh b/arch/aarch64/arch-elf.hh
index e1ac00a5..90bcc887 100644
--- a/arch/aarch64/arch-elf.hh
+++ b/arch/aarch64/arch-elf.hh
@@ -18,5 +18,6 @@ enum {
 
 /* for pltgot relocation */
 #define ARCH_JUMP_SLOT R_AARCH64_JUMP_SLOT
+#define ARCH_TLSDESC R_AARCH64_TLSDESC
 
 #endif /* ARCH_ELF_HH */
diff --git a/arch/aarch64/arch-switch.hh b/arch/aarch64/arch-switch.hh
index 8775535d..dff7467c 100644
--- a/arch/aarch64/arch-switch.hh
+++ b/arch/aarch64/arch-switch.hh
@@ -87,14 +87,87 @@ void thread::init_stack()
 }
 
 void thread::setup_tcb()
-{
+{   //
+    // Most importantly this method allocates the TLS memory region and
+    // sets up the TCB (Thread Control Block) that points to that allocated
+    // memory region. The TLS memory region is designated to a specific thread
+    // and holds thread-local variables (with the __thread modifier) defined
+    // in the OSv kernel and the application ELF objects, including dependent ones
+    // pulled in through the DT_NEEDED tag.
+    //
+    // Each ELF object and the OSv kernel gets its own TLS block with offsets
+    // specified in the DTV structure (the offsets are calculated as the ELF is loaded
+    // and symbols are resolved before we get to this point).
+    //
+    // Because both the OSv kernel and a position-independent (pie) or position-dependent
+    // executable (i.e. a non-library) are compiled to use local-exec mode to access the
+    // thread-local variables, we need to set up the offsets and TLS blocks in a special way
+    // to avoid any collisions. Specifically, we define the OSv TLS segment
+    // (see arch/aarch64/loader.ld for specifics) with an extra buffer at
+    // the beginning of the kernel TLS to accommodate the TLS block of pies and
+    // position-dependent executables.
+    //
+    // Please note that the TLS layout conforms to variant I (1),
+    // which means for example that all variable offsets are positive.
+    // It also means that individual objects are laid out from left to right.
+
+    // (1) - TLS memory area layout with app shared library
+    // |------|--------------|-----|-----|-----|
+    // |<NONE>|KERNEL        |SO_1 |SO_2 |SO_3 |
+    // |------|--------------|-----|-----|-----|
+
+    // (2) - TLS memory area layout with pie or
+    // position-dependent executable
+    // |------|--------------|-----|-----|
+    // | EXE  |KERNEL        |SO_2 |SO_3 |
+    // |------|--------------|-----|-----|
+
     assert(tls.size);
-    void* p = malloc(sched::tls.size + 1024);
-    memset(p, 0, sched::tls.size + 1024);
 
+    void* user_tls_data;
+    size_t user_tls_size = 0;
+    size_t executable_tls_size = 0;
+    if (_app_runtime) {
+        auto obj = _app_runtime->app.lib();
+        assert(obj);
+        user_tls_size = obj->initial_tls_size();
+        user_tls_data = obj->initial_tls();
+        if (obj->is_executable()) {
+            executable_tls_size = obj->get_tls_size();
+        }
+    }
+
+    // In arch/aarch64/loader.ld, the TLS template segment is aligned to 64
+    // bytes, and that's what the objects placed in it assume. So make
+    // sure our copy is allocated with the same 64-byte alignment, and
+    // verify that object::init_static_tls() ensured that user_tls_size
+    // also doesn't break this alignment.
+    auto kernel_tls_size = sched::tls.size;
+    assert(align_check(kernel_tls_size, (size_t)64));
+    assert(align_check(user_tls_size, (size_t)64));
+
+    auto total_tls_size = kernel_tls_size + user_tls_size;
+    void* p = aligned_alloc(64, total_tls_size + sizeof(*_tcb));
     _tcb = (thread_control_block *)p;
     _tcb[0].tls_base = &_tcb[1];
-    memcpy(&_tcb[1], sched::tls.start, sched::tls.filesize);
+    //
+    // First goes kernel TLS data
+    auto kernel_tls = _tcb[0].tls_base;
+    memcpy(kernel_tls, sched::tls.start, sched::tls.filesize);
+    memset(kernel_tls + sched::tls.filesize, 0,
+           kernel_tls_size - sched::tls.filesize);
+    //
+    // Next goes user TLS data
+    if (user_tls_size) {
+        memcpy(kernel_tls + kernel_tls_size, user_tls_data, user_tls_size);
+    }
+
+    if (executable_tls_size) {
+        // If this is an executable, copy its TLS block data to the designated offset
+        // at the beginning of the area, as described in the ascii art for the
+        // executables' TLS layout above
+        _app_runtime->app.lib()->copy_local_tls(kernel_tls);
+    }
 }
 
 void thread::free_tcb()
diff --git a/arch/aarch64/loader.ld b/arch/aarch64/loader.ld
index a02e52b7..a0656d23 100644
--- a/arch/aarch64/loader.ld
+++ b/arch/aarch64/loader.ld
@@ -105,17 +105,18 @@ SECTIONS
     }
     /* do not align tdata, tbss with .tdata : ALIGN (64),
        or the linker will offset the TLS loads accordingly! */
-    .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } : tls : text
-    .tbss : {
-        *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
+    .tdata : {
         _pie_static_tls_start = .;
+        /* This is a reserve intended for executables' (pie or non-pie) TLS block */
         . = . + APP_LOCAL_EXEC_TLS_SIZE;
         _pie_static_tls_end = .;
+        *(.tdata .tdata.* .gnu.linkonce.td.*)
+    } : tls : text
+    .tbss : {
+        *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
     } : tls : text
     .tls_template_size = SIZEOF(.tdata) + SIZEOF(.tbss);
     .bss : { *(.dynbss .bss .bss.* .gnu.linkonce.b.*) } : text
-
     . = ALIGN(64);
     tcb0 = .;
     . = . + .tls_template_size + 1024;
diff --git a/arch/aarch64/tlsdesc.s b/arch/aarch64/tlsdesc.s
new file mode 100644
index 00000000..98fff77a
--- /dev/null
+++ b/arch/aarch64/tlsdesc.s
@@ -0,0 +1,10 @@
+// size_t __tlsdesc_static(size_t *a)
+// {
+//     return a[1];
+// }
+.global __tlsdesc_static
+.hidden __tlsdesc_static
+.type __tlsdesc_static,@function
+__tlsdesc_static:
+       ldr x0,[x0,#8]
+       ret
diff --git a/arch/x64/arch-elf.cc b/arch/x64/arch-elf.cc
index edc81291..321ef6bf 100644
--- a/arch/x64/arch-elf.cc
+++ b/arch/x64/arch-elf.cc
@@ -166,6 +166,11 @@ bool object::arch_relocate_jump_slot(symbol_module& sym, void *addr, Elf64_Sxwor
     }
 }
 
+void object::arch_relocate_tls_desc(symbol_module& sym, void *addr, Elf64_Sxword addend)
+{
+    abort("Not implemented!");
+}
+
 void object::prepare_initial_tls(void* buffer, size_t size,
                                  std::vector<ptrdiff_t>& offsets)
 {
diff --git a/arch/x64/arch-elf.hh b/arch/x64/arch-elf.hh
index 1811ceb5..de854bbc 100644
--- a/arch/x64/arch-elf.hh
+++ b/arch/x64/arch-elf.hh
@@ -31,10 +31,12 @@ enum {
     R_X86_64_GOTPC32 = 26, //  word32 GOT + A - P
     R_X86_64_SIZE32 = 32, //  word32 Z + A
     R_X86_64_SIZE64 = 33, //  word64 Z + A
+    R_X86_64_TLSDESC = 36,
     R_X86_64_IRELATIVE = 37, //  word64 indirect(B + A)
 };
 
 /* for pltgot relocation */
 #define ARCH_JUMP_SLOT R_X86_64_JUMP_SLOT
+#define ARCH_TLSDESC R_X86_64_TLSDESC
 
 #endif /* ARCH_ELF_HH */
diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index 6803498f..ee8b7e99 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -170,6 +170,10 @@ void thread::setup_tcb()
     // (see arch/x64/loader.ld for specifics) with an extra buffer at
     // the end of the kernel TLS to accommodate TLS block of pies and
     // position-dependant executables.
+    //
+    // Please note that the TLS layout conforms to variant II (2),
+    // which means for example that all variable offsets are negative.
+    // It also means that individual objects are laid out from right to left.
 
     // (1) - TLS memory area layout with app shared library
     // |-----|-----|-----|--------------|------|
diff --git a/core/elf.cc b/core/elf.cc
index 0d33f63d..e4263db2 100644
--- a/core/elf.cc
+++ b/core/elf.cc
@@ -786,29 +786,35 @@ void object::relocate_pltgot()
     for (auto p = rel; p < rel + nrel; ++p) {
         auto info = p->r_info;
         u32 type = info & 0xffffffff;
-        assert(type == ARCH_JUMP_SLOT);
         void *addr = _base + p->r_offset;
-        if (bind_now) {
-            // If on-load binding is requested (instead of the default lazy
-            // binding), try to resolve all the PLT entries now.
-            // If symbol cannot be resolved warn about it instead of aborting
-            u32 sym = info >> 32;
-            auto _sym = symbol(sym, true);
-            if (arch_relocate_jump_slot(_sym, addr, p->r_addend))
-                  continue;
-        }
-        if (original_plt) {
-            // Restore the link to the original plt.
-            // We know the JUMP_SLOT entries are in plt order, and that
-            // each plt entry is 16 bytes.
-            *static_cast<void**>(addr) = original_plt + (p-rel)*16;
+        assert(type == ARCH_JUMP_SLOT || type == ARCH_TLSDESC);
+        if (type == ARCH_JUMP_SLOT) {
+            if (bind_now) {
+                // If on-load binding is requested (instead of the default lazy
+                // binding), try to resolve all the PLT entries now.
+                // If symbol cannot be resolved warn about it instead of aborting
+                u32 sym = info >> 32;
+                auto _sym = symbol(sym, true);
+                if (arch_relocate_jump_slot(_sym, addr, p->r_addend))
+                    continue;
+            }
+            if (original_plt) {
+                // Restore the link to the original plt.
+                // We know the JUMP_SLOT entries are in plt order, and that
+                // each plt entry is 16 bytes.
+                *static_cast<void**>(addr) = original_plt + (p-rel)*16;
+            } else {
+                // The JUMP_SLOT entry already points back to the PLT, just
+                // make sure it is relocated relative to the object base.
+                *static_cast<u64*>(addr) += reinterpret_cast<u64>(_base);
+            }
         } else {
-            // The JUMP_SLOT entry already points back to the PLT, just
-            // make sure it is relocated relative to the object base.
-            *static_cast<u64*>(addr) += reinterpret_cast<u64>(_base);
+            u32 sym = info >> 32;
+            auto _sym = symbol(sym, false);
+            arch_relocate_tls_desc(_sym, addr, p->r_addend);
         }
     }
-    elf_debug("Relocated %d PLT symbols in DT_JMPREL\n", nrel);
+    elf_debug("Relocated %d PLT symbols\n", nrel);
 
     // PLTGOT resolution has a special calling convention,
     // for x64 the symbol index and some word is pushed on the stack,
diff --git a/include/osv/elf.hh b/include/osv/elf.hh
index 8f8db68a..2050f766 100644
--- a/include/osv/elf.hh
+++ b/include/osv/elf.hh
@@ -202,6 +202,8 @@ enum {
       // processor-specific use.
     DT_HIPROC = 0x7FFFFFFF, //
     DT_GNU_HASH = 0x6ffffef5,
+    DT_TLSDESC_PLT = 0x6ffffef6,
+    DT_TLSDESC_GOT = 0x6ffffef7,
 };
 
 enum {
@@ -454,12 +456,14 @@ protected:
     bool arch_relocate_rela(u32 type, u32 sym, void *addr,
                             Elf64_Sxword addend);
     bool arch_relocate_jump_slot(symbol_module& sym, void *addr, Elf64_Sxword addend);
+    void arch_relocate_tls_desc(symbol_module& sym, void *addr, Elf64_Sxword addend);
     size_t static_tls_end() {
         if (is_core() || is_executable()) {
             return 0;
         }
         return _static_tls_offset + get_tls_size();
     }
+    size_t static_tls_offset() { return _static_tls_offset; }
 private:
     std::atomic<void*> _visibility_thread;
     std::atomic<VisibilityLevel> _visibility_level;
diff --git a/scripts/test.py b/scripts/test.py
index c0620596..5128f41d 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -61,7 +61,6 @@ aarch64_blacklist= [
     "tst-sampler.so",              # Crashes with 'failed looking up symbol 
_ZN4prof13start_samplerENS_6configE (prof::start_sampler(prof::config))'
     "tst-semaphore.so",            # Seems to hang after 'Thread *: 
Incremented 1th' messages
     "tst-stdio-rofs.so",           # One assertion fails - 
'tst-stdio.cc(1922): fatal error: in 
"STDIO_TEST_fread_unbuffered_pathological_performance": critical check (t1 - 
t0) <= (1) has failed'
-    "tst-thread-local.so",         # Crashes due to missing TLS support
     "tst-time.so",                 # One assertion fails - 'tst-time.cc(70): 
fatal error: in "time_time": critical check (static_cast<time_t>(0)) != (t1) 
has failed'
     "tst-timerfd.so",              # Some assertions fail - 'SUMMARY: 212 
tests, 10 failures'
     "tst-yield.so",                # Seems to hang
-- 
2.26.2
