musl/ldso/dynlink.c

2314 lines
63 KiB
C
Raw Normal View History

#define _GNU_SOURCE
overhaul i386 syscall mechanism not to depend on external asm source this is the first part of a series of patches intended to make __syscall fully self-contained in the object file produced using syscall.h, which will make it possible for crt1 code to perform syscalls. the (confusingly named) i386 __vsyscall mechanism, which this commit removes, was introduced before the presence of a valid thread pointer was mandatory; back then the thread pointer was setup lazily only if threads were used. the intent was to be able to perform syscalls using the kernel's fast entry point in the VDSO, which can use the sysenter (Intel) or syscall (AMD) instruction instead of int $128, but without inlining an access to the __syscall global at the point of each syscall, which would incur a significant size cost from PIC setup everywhere. the mechanism also shuffled registers/calling convention around to avoid spills of call-saved registers, and to avoid allocating ebx or ebp via asm constraints, since there are plenty of broken-but-supported compiler versions which are incapable of allocating ebx with -fPIC or ebp with -fno-omit-frame-pointer. the new mechanism preserves the properties of avoiding spills and avoiding allocation of ebx/ebp in constraints, but does it inline, using some fairly simple register shuffling, and uses a field of the thread structure rather than global data for the vdso-provided syscall code address. for now, the external __syscall function is refactored not to use the old __vsyscall so it can be kept, but the intent is to remove it too.
2019-04-11 00:10:36 +03:00
#define SYSCALL_NO_TLS 1
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <elf.h>
#include <sys/mman.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
#include <link.h>
#include <setjmp.h>
#include <pthread.h>
2011-08-16 08:24:36 +04:00
#include <ctype.h>
#include <dlfcn.h>
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
#include <semaphore.h>
#include <sys/membarrier.h>
#include "pthread_impl.h"
#include "libc.h"
#include "dynlink.h"
static void error(const char *, ...);
#define MAXP2(a,b) (-(-(a)&-(b)))
#define ALIGN(x,y) ((x)+(y)-1 & -(y))
#define container_of(p,t,m) ((t*)((char *)(p)-offsetof(t,m)))
#define countof(a) ((sizeof (a))/(sizeof (a)[0]))
struct debug {
int ver;
void *head;
void (*bp)(void);
int state;
void *base;
};
struct td_index {
size_t args[2];
struct td_index *next;
};
struct dso {
#if DL_FDPIC
struct fdpic_loadmap *loadmap;
#else
unsigned char *base;
#endif
char *name;
size_t *dynv;
struct dso *next, *prev;
Phdr *phdr;
int phnum;
size_t phentsize;
Sym *syms;
Elf_Symndx *hashtab;
uint32_t *ghashtab;
int16_t *versym;
char *strings;
struct dso *syms_next, *lazy_next;
size_t *lazy, lazy_cnt;
unsigned char *map;
size_t map_len;
dev_t dev;
ino_t ino;
char relocated;
char constructed;
char kernel_mapped;
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
char mark;
char bfs_built;
char runtime_loaded;
struct dso **deps, *needed_by;
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
size_t ndeps_direct;
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
size_t next_dep;
int ctor_visitor;
char *rpath_orig, *rpath;
struct tls_module tls;
size_t tls_id;
2014-03-25 16:13:27 +04:00
size_t relro_start, relro_end;
uintptr_t *new_dtv;
unsigned char *new_tls;
struct td_index *td_index;
struct dso *fini_next;
char *shortname;
#if DL_FDPIC
unsigned char *base;
#else
struct fdpic_loadmap *loadmap;
#endif
struct funcdesc {
void *addr;
size_t *got;
} *funcdescs;
size_t *got;
char buf[];
};
struct symdef {
Sym *sym;
struct dso *dso;
};
typedef void (*stage3_func)(size_t *, size_t *);
static struct builtin_tls {
char c;
struct pthread pt;
void *space[16];
} builtin_tls[1];
#define MIN_TLS_ALIGN offsetof(struct builtin_tls, pt)
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
#define ADDEND_LIMIT 4096
static size_t *saved_addends, *apply_addends_to;
static struct dso ldso;
static struct dso *head, *tail, *fini_head, *syms_tail, *lazy_head;
static char *env_path, *sys_path;
static unsigned long long gencnt;
static int runtime;
static int ldd_mode;
static int ldso_fail;
static int noload;
static int shutting_down;
static jmp_buf *rtld_fail;
static pthread_rwlock_t lock;
static struct debug debug;
static struct tls_module *tls_tail;
static size_t tls_cnt, tls_offset, tls_align = MIN_TLS_ALIGN;
static size_t static_tls_cnt;
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
static pthread_mutex_t init_fini_lock;
static pthread_cond_t ctor_cond;
static struct dso *builtin_deps[2];
static struct dso *const no_deps[1];
static struct dso *builtin_ctor_queue[4];
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
static struct dso **main_ctor_queue;
static struct fdpic_loadmap *app_loadmap;
static struct fdpic_dummy_loadmap app_dummy_loadmap;
struct debug *_dl_debug_addr = &debug;
extern hidden int __malloc_replaced;
hidden void (*const __init_array_start)(void)=0, (*const __fini_array_start)(void)=0;
remove undef weak refs to init/fini array symbols in libc.so commit ad1cd43a86645ba2d4f7c8747240452a349d6bc1 eliminated preprocessor-level omission of references to the init/fini array symbols from object files going into libc.so. the references are weak, and the intent was that the linker would resolve them to zero in libc.so, but instead it leaves undefined references that could be satisfied at runtime. normally these references would be harmless, since the code using them does not even get executed, but some older binutils versions produce a linking error: when linking a program against libc.so, ld first tries to use the hidden init/fini array symbols produced by the linker script to satisfy the references in libc.so, then produces an error because the definitions are hidden. ideally ld would have already provided definitions of these symbols when linking libc.so, but the linker script for -shared omits them. to avoid this situation, the dynamic linker now provides its own dummy definitions of the init/fini array symbols for libc.so. since they are hidden, everything binds at ld time and no references remain in the dynamic symbol table. with modern binutils and --gc-sections, both the dummy empty array objects and the code referencing them get dropped at link time, anyway. the _init and _fini symbols are also switched back to using weak definitions rather than weak references since the latter behave somewhat problematically in general, and the weak definition approach was known to work well.
2015-11-20 04:28:08 +03:00
extern hidden void (*const __init_array_end)(void), (*const __fini_array_end)(void);
remove undef weak refs to init/fini array symbols in libc.so commit ad1cd43a86645ba2d4f7c8747240452a349d6bc1 eliminated preprocessor-level omission of references to the init/fini array symbols from object files going into libc.so. the references are weak, and the intent was that the linker would resolve them to zero in libc.so, but instead it leaves undefined references that could be satisfied at runtime. normally these references would be harmless, since the code using them does not even get executed, but some older binutils versions produce a linking error: when linking a program against libc.so, ld first tries to use the hidden init/fini array symbols produced by the linker script to satisfy the references in libc.so, then produces an error because the definitions are hidden. ideally ld would have already provided definitions of these symbols when linking libc.so, but the linker script for -shared omits them. to avoid this situation, the dynamic linker now provides its own dummy definitions of the init/fini array symbols for libc.so. since they are hidden, everything binds at ld time and no references remain in the dynamic symbol table. with modern binutils and --gc-sections, both the dummy empty array objects and the code referencing them get dropped at link time, anyway. the _init and _fini symbols are also switched back to using weak definitions rather than weak references since the latter behave somewhat problematically in general, and the weak definition approach was known to work well.
2015-11-20 04:28:08 +03:00
weak_alias(__init_array_start, __init_array_end);
weak_alias(__fini_array_start, __fini_array_end);
static int dl_strcmp(const char *l, const char *r)
{
for (; *l==*r && *l; l++, r++);
return *(unsigned char *)l - *(unsigned char *)r;
}
#define strcmp(l,r) dl_strcmp(l,r)
/* Compute load address for a virtual address in a given dso. */
#if DL_FDPIC
static void *laddr(const struct dso *p, size_t v)
{
size_t j=0;
if (!p->loadmap) return p->base + v;
for (j=0; v-p->loadmap->segs[j].p_vaddr >= p->loadmap->segs[j].p_memsz; j++);
return (void *)(v - p->loadmap->segs[j].p_vaddr + p->loadmap->segs[j].addr);
}
static void *laddr_pg(const struct dso *p, size_t v)
{
size_t j=0;
size_t pgsz = PAGE_SIZE;
if (!p->loadmap) return p->base + v;
for (j=0; ; j++) {
size_t a = p->loadmap->segs[j].p_vaddr;
size_t b = a + p->loadmap->segs[j].p_memsz;
a &= -pgsz;
b += pgsz-1;
b &= -pgsz;
if (v-a<b-a) break;
}
return (void *)(v - p->loadmap->segs[j].p_vaddr + p->loadmap->segs[j].addr);
}
static void (*fdbarrier(void *p))()
{
void (*fd)();
__asm__("" : "=r"(fd) : "0"(p));
return fd;
}
#define fpaddr(p, v) fdbarrier((&(struct funcdesc){ \
laddr(p, v), (p)->got }))
#else
#define laddr(p, v) (void *)((p)->base + (v))
#define laddr_pg(p, v) laddr(p, v)
#define fpaddr(p, v) ((void (*)())laddr(p, v))
#endif
static void decode_vec(size_t *v, size_t *a, size_t cnt)
{
size_t i;
for (i=0; i<cnt; i++) a[i] = 0;
for (; v[0]; v+=2) if (v[0]-1<cnt-1) {
a[0] |= 1UL<<v[0];
a[v[0]] = v[1];
}
}
static int search_vec(size_t *v, size_t *r, size_t key)
{
for (; v[0]!=key; v+=2)
if (!v[0]) return 0;
*r = v[1];
return 1;
}
static uint32_t sysv_hash(const char *s0)
{
const unsigned char *s = (void *)s0;
uint_fast32_t h = 0;
while (*s) {
h = 16*h + *s++;
h ^= h>>24 & 0xf0;
}
return h & 0xfffffff;
}
static uint32_t gnu_hash(const char *s0)
{
const unsigned char *s = (void *)s0;
uint_fast32_t h = 5381;
for (; *s; s++)
h += h*32 + *s;
return h;
}
static Sym *sysv_lookup(const char *s, uint32_t h, struct dso *dso)
{
size_t i;
2012-08-05 10:38:35 +04:00
Sym *syms = dso->syms;
Elf_Symndx *hashtab = dso->hashtab;
2012-08-05 10:38:35 +04:00
char *strings = dso->strings;
for (i=hashtab[2+h%hashtab[0]]; i; i=hashtab[2+hashtab[0]+i]) {
if ((!dso->versym || dso->versym[i] >= 0)
&& (!strcmp(s, strings+syms[i].st_name)))
return syms+i;
}
return 0;
}
static Sym *gnu_lookup(uint32_t h1, uint32_t *hashtab, struct dso *dso, const char *s)
{
uint32_t nbuckets = hashtab[0];
uint32_t *buckets = hashtab + 4 + hashtab[2]*(sizeof(size_t)/4);
uint32_t i = buckets[h1 % nbuckets];
if (!i) return 0;
uint32_t *hashval = buckets + nbuckets + (i - hashtab[1]);
for (h1 |= 1; ; i++) {
uint32_t h2 = *hashval++;
if ((h1 == (h2|1)) && (!dso->versym || dso->versym[i] >= 0)
&& !strcmp(s, dso->strings + dso->syms[i].st_name))
return dso->syms+i;
if (h2 & 1) break;
}
return 0;
}
static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso, const char *s, uint32_t fofs, size_t fmask)
{
const size_t *bloomwords = (const void *)(hashtab+4);
size_t f = bloomwords[fofs & (hashtab[2]-1)];
if (!(f & fmask)) return 0;
f >>= (h1 >> hashtab[3]) % (8 * sizeof f);
if (!(f & 1)) return 0;
return gnu_lookup(h1, hashtab, dso, s);
}
#define OK_TYPES (1<<STT_NOTYPE | 1<<STT_OBJECT | 1<<STT_FUNC | 1<<STT_COMMON | 1<<STT_TLS)
#define OK_BINDS (1<<STB_GLOBAL | 1<<STB_WEAK | 1<<STB_GNU_UNIQUE)
fix regression in mips dynamic linker this issue caused the address of functions in shared libraries to resolve to their PLT thunks in the main program rather than their correct addresses. it was observed causing crashes, though the mechanism of the crash was not thoroughly investigated. since the issue is very subtle, it calls for some explanation: on all well-behaved archs, GOT entries that belong to the PLT use a special relocation type, typically called JMP_SLOT, so that the dynamic linker can avoid having the jump destinations for the PLT resolve to PLT thunks themselves (they also provide a definition for the symbol, which must be used whenever the address of the function is taken so that all DSOs see the same address). however, the traditional mips PIC ABI lacked such a JMP_SLOT relocation type, presumably because, due to the way PIC works, the address of the PLT thunk was never needed and could always be ignored. prior to commit adf94c19666e687a728bbf398f9a88ea4ea19996, the mips version of reloc.h contained a hack that caused all symbol lookups to be treated like JMP_SLOT, inhibiting undefined symbols from ever being used to resolve symbolic relocations. this hack goes all the way back to commit babf820180368f00742ec65b2050a82380d7c542, when the mips dynamic linker was first made usable. during the recent refactoring to eliminate arch-specific relocation processing (commit adf94c19666e687a728bbf398f9a88ea4ea19996), this hack was overlooked and no equivalent functionality was provided in the new code. fixing the problem is not as simple as adding back an equivalent hack, since there is now also a "non-PIC ABI" that can be used for the main executable, which actually does use a PLT. the closest thing to official documentation I could find for this ABI is nonpic.txt, attached to Message-ID: 20080701202236.GA1534@caradoc.them.org, which can be found in the gcc mailing list archives and elsewhere. per this document, undefined symbols corresponding to PLT thunks have the STO_MIPS_PLT bit set in the symbol's st_other field. thus, I have added an arch-specific rule for mips, applied at the find_sym level rather than the relocation level, to reject undefined symbols with the STO_MIPS_PLT bit clear. the previous hack of treating all mips relocations as JMP_SLOT-like, rather than rejecting the unwanted symbols in find_sym, probably also caused dlsym to wrongly return PLT thunks in place of the correct address of a function under at least some conditions. this should now be fixed, at least for global-scope symbol lookups.
2014-06-30 09:18:14 +04:00
#ifndef ARCH_SYM_REJECT_UND
#define ARCH_SYM_REJECT_UND(s) 0
#endif
#if defined(__GNUC__)
__attribute__((always_inline))
#endif
static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps)
{
uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
size_t ghm = 1ul << gh % (8*sizeof(size_t));
struct symdef def = {0};
struct dso **deps = use_deps ? dso->deps : 0;
for (; dso; dso=use_deps ? *deps++ : dso->syms_next) {
Sym *sym;
if ((ght = dso->ghashtab)) {
sym = gnu_lookup_filtered(gh, ght, dso, s, gho, ghm);
} else {
if (!h) h = sysv_hash(s);
sym = sysv_lookup(s, h, dso);
}
if (!sym) continue;
if (!sym->st_shndx)
fix regression in mips dynamic linker this issue caused the address of functions in shared libraries to resolve to their PLT thunks in the main program rather than their correct addresses. it was observed causing crashes, though the mechanism of the crash was not thoroughly investigated. since the issue is very subtle, it calls for some explanation: on all well-behaved archs, GOT entries that belong to the PLT use a special relocation type, typically called JMP_SLOT, so that the dynamic linker can avoid having the jump destinations for the PLT resolve to PLT thunks themselves (they also provide a definition for the symbol, which must be used whenever the address of the function is taken so that all DSOs see the same address). however, the traditional mips PIC ABI lacked such a JMP_SLOT relocation type, presumably because, due to the way PIC works, the address of the PLT thunk was never needed and could always be ignored. prior to commit adf94c19666e687a728bbf398f9a88ea4ea19996, the mips version of reloc.h contained a hack that caused all symbol lookups to be treated like JMP_SLOT, inhibiting undefined symbols from ever being used to resolve symbolic relocations. this hack goes all the way back to commit babf820180368f00742ec65b2050a82380d7c542, when the mips dynamic linker was first made usable. during the recent refactoring to eliminate arch-specific relocation processing (commit adf94c19666e687a728bbf398f9a88ea4ea19996), this hack was overlooked and no equivalent functionality was provided in the new code. fixing the problem is not as simple as adding back an equivalent hack, since there is now also a "non-PIC ABI" that can be used for the main executable, which actually does use a PLT. the closest thing to official documentation I could find for this ABI is nonpic.txt, attached to Message-ID: 20080701202236.GA1534@caradoc.them.org, which can be found in the gcc mailing list archives and elsewhere. per this document, undefined symbols corresponding to PLT thunks have the STO_MIPS_PLT bit set in the symbol's st_other field. thus, I have added an arch-specific rule for mips, applied at the find_sym level rather than the relocation level, to reject undefined symbols with the STO_MIPS_PLT bit clear. the previous hack of treating all mips relocations as JMP_SLOT-like, rather than rejecting the unwanted symbols in find_sym, probably also caused dlsym to wrongly return PLT thunks in place of the correct address of a function under at least some conditions. this should now be fixed, at least for global-scope symbol lookups.
2014-06-30 09:18:14 +04:00
if (need_def || (sym->st_info&0xf) == STT_TLS
|| ARCH_SYM_REJECT_UND(sym))
continue;
if (!sym->st_value)
if ((sym->st_info&0xf) != STT_TLS)
continue;
if (!(1<<(sym->st_info&0xf) & OK_TYPES)) continue;
if (!(1<<(sym->st_info>>4) & OK_BINDS)) continue;
def.sym = sym;
def.dso = dso;
break;
}
return def;
}
static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
{
return find_sym2(dso, s, need_def, 0);
}
static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stride)
{
unsigned char *base = dso->base;
Sym *syms = dso->syms;
char *strings = dso->strings;
Sym *sym;
const char *name;
void *ctx;
int type;
int sym_index;
struct symdef def;
size_t *reloc_addr;
size_t sym_val;
size_t tls_val;
size_t addend;
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
int skip_relative = 0, reuse_addends = 0, save_slot = 0;
if (dso == &ldso) {
/* Only ldso's REL table needs addend saving/reuse. */
if (rel == apply_addends_to)
reuse_addends = 1;
skip_relative = 1;
}
for (; rel_size; rel+=stride, rel_size-=stride*sizeof(size_t)) {
if (skip_relative && IS_RELATIVE(rel[1], dso->syms)) continue;
type = R_TYPE(rel[1]);
if (type == REL_NONE) continue;
reloc_addr = laddr(dso, rel[0]);
if (stride > 2) {
addend = rel[2];
} else if (type==REL_GOT || type==REL_PLT|| type==REL_COPY) {
addend = 0;
} else if (reuse_addends) {
/* Save original addend in stage 2 where the dso
* chain consists of just ldso; otherwise read back
* saved addend since the inline one was clobbered. */
if (head==&ldso)
saved_addends[save_slot] = *reloc_addr;
addend = saved_addends[save_slot++];
} else {
addend = *reloc_addr;
}
sym_index = R_SYM(rel[1]);
if (sym_index) {
sym = syms + sym_index;
name = strings + sym->st_name;
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
ctx = type==REL_COPY ? head->syms_next : head;
def = (sym->st_info>>4) == STB_LOCAL
? (struct symdef){ .dso = dso, .sym = sym }
: find_sym(ctx, name, type==REL_PLT);
if (!def.sym && (sym->st_shndx != SHN_UNDEF
|| sym->st_info>>4 != STB_WEAK)) {
if (dso->lazy && (type==REL_PLT || type==REL_GOT)) {
dso->lazy[3*dso->lazy_cnt+0] = rel[0];
dso->lazy[3*dso->lazy_cnt+1] = rel[1];
dso->lazy[3*dso->lazy_cnt+2] = addend;
dso->lazy_cnt++;
continue;
}
error("Error relocating %s: %s: symbol not found",
dso->name, name);
if (runtime) longjmp(*rtld_fail, 1);
continue;
}
} else {
sym = 0;
def.sym = 0;
def.dso = dso;
}
sym_val = def.sym ? (size_t)laddr(def.dso, def.sym->st_value) : 0;
tls_val = def.sym ? def.sym->st_value : 0;
if ((type == REL_TPOFF || type == REL_TPOFF_NEG)
&& def.dso->tls_id > static_tls_cnt) {
error("Error relocating %s: %s: initial-exec TLS "
"resolves to dynamic definition in %s",
dso->name, name, def.dso->name);
longjmp(*rtld_fail, 1);
}
switch(type) {
case REL_OFFSET:
addend -= (size_t)reloc_addr;
case REL_SYMBOLIC:
case REL_GOT:
case REL_PLT:
*reloc_addr = sym_val + addend;
break;
case REL_USYMBOLIC:
memcpy(reloc_addr, &(size_t){sym_val + addend}, sizeof(size_t));
break;
case REL_RELATIVE:
*reloc_addr = (size_t)base + addend;
break;
case REL_SYM_OR_REL:
if (sym) *reloc_addr = sym_val + addend;
else *reloc_addr = (size_t)base + addend;
break;
case REL_COPY:
memcpy(reloc_addr, (void *)sym_val, sym->st_size);
break;
case REL_OFFSET32:
*(uint32_t *)reloc_addr = sym_val + addend
- (size_t)reloc_addr;
break;
case REL_FUNCDESC:
*reloc_addr = def.sym ? (size_t)(def.dso->funcdescs
+ (def.sym - def.dso->syms)) : 0;
break;
case REL_FUNCDESC_VAL:
if ((sym->st_info&0xf) == STT_SECTION) *reloc_addr += sym_val;
else *reloc_addr = sym_val;
reloc_addr[1] = def.sym ? (size_t)def.dso->got : 0;
break;
case REL_DTPMOD:
*reloc_addr = def.dso->tls_id;
break;
case REL_DTPOFF:
*reloc_addr = tls_val + addend - DTP_OFFSET;
break;
#ifdef TLS_ABOVE_TP
case REL_TPOFF:
*reloc_addr = tls_val + def.dso->tls.offset + TPOFF_K + addend;
break;
#else
case REL_TPOFF:
*reloc_addr = tls_val - def.dso->tls.offset + addend;
break;
case REL_TPOFF_NEG:
*reloc_addr = def.dso->tls.offset - tls_val + addend;
break;
#endif
case REL_TLSDESC:
if (stride<3) addend = reloc_addr[1];
if (def.dso->tls_id > static_tls_cnt) {
struct td_index *new = malloc(sizeof *new);
if (!new) {
error(
"Error relocating %s: cannot allocate TLSDESC for %s",
dso->name, sym ? name : "(local)" );
longjmp(*rtld_fail, 1);
}
new->next = dso->td_index;
dso->td_index = new;
new->args[0] = def.dso->tls_id;
new->args[1] = tls_val + addend - DTP_OFFSET;
reloc_addr[0] = (size_t)__tlsdesc_dynamic;
reloc_addr[1] = (size_t)new;
} else {
reloc_addr[0] = (size_t)__tlsdesc_static;
#ifdef TLS_ABOVE_TP
reloc_addr[1] = tls_val + def.dso->tls.offset
+ TPOFF_K + addend;
#else
reloc_addr[1] = tls_val - def.dso->tls.offset
+ addend;
#endif
}
#ifdef TLSDESC_BACKWARDS
/* Some archs (32-bit ARM at least) invert the order of
* the descriptor members. Fix them up here. */
size_t tmp = reloc_addr[0];
reloc_addr[0] = reloc_addr[1];
reloc_addr[1] = tmp;
#endif
break;
default:
error("Error relocating %s: unsupported relocation type %d",
dso->name, type);
if (runtime) longjmp(*rtld_fail, 1);
continue;
}
}
}
static void redo_lazy_relocs()
{
struct dso *p = lazy_head, *next;
lazy_head = 0;
for (; p; p=next) {
next = p->lazy_next;
size_t size = p->lazy_cnt*3*sizeof(size_t);
p->lazy_cnt = 0;
do_relocs(p, p->lazy, size, 3);
if (p->lazy_cnt) {
p->lazy_next = lazy_head;
lazy_head = p;
} else {
free(p->lazy);
p->lazy = 0;
p->lazy_next = 0;
}
}
}
/* A huge hack: to make up for the wastefulness of shared libraries
* needing at least a page of dirty memory even if they have no global
* data, we reclaim the gaps at the beginning and end of writable maps
* and "donate" them to the heap. */
2014-03-25 16:13:27 +04:00
static void reclaim(struct dso *dso, size_t start, size_t end)
{
2014-03-25 16:13:27 +04:00
if (start >= dso->relro_start && start < dso->relro_end) start = dso->relro_end;
if (end >= dso->relro_start && end < dso->relro_end) end = dso->relro_start;
if (start >= end) return;
char *base = laddr_pg(dso, start);
__malloc_donate(base, base+(end-start));
}
static void reclaim_gaps(struct dso *dso)
{
Phdr *ph = dso->phdr;
size_t phcnt = dso->phnum;
for (; phcnt--; ph=(void *)((char *)ph+dso->phentsize)) {
if (ph->p_type!=PT_LOAD) continue;
if ((ph->p_flags&(PF_R|PF_W))!=(PF_R|PF_W)) continue;
2014-03-25 16:13:27 +04:00
reclaim(dso, ph->p_vaddr & -PAGE_SIZE, ph->p_vaddr);
reclaim(dso, ph->p_vaddr+ph->p_memsz,
ph->p_vaddr+ph->p_memsz+PAGE_SIZE-1 & -PAGE_SIZE);
}
}
static void *mmap_fixed(void *p, size_t n, int prot, int flags, int fd, off_t off)
{
static int no_map_fixed;
char *q;
if (!no_map_fixed) {
q = mmap(p, n, prot, flags|MAP_FIXED, fd, off);
if (!DL_NOMMU_SUPPORT || q != MAP_FAILED || errno != EINVAL)
return q;
no_map_fixed = 1;
}
/* Fallbacks for MAP_FIXED failure on NOMMU kernels. */
if (flags & MAP_ANONYMOUS) {
memset(p, 0, n);
return p;
}
ssize_t r;
if (lseek(fd, off, SEEK_SET) < 0) return MAP_FAILED;
for (q=p; n; q+=r, off+=r, n-=r) {
r = read(fd, q, n);
if (r < 0 && errno != EINTR) return MAP_FAILED;
if (!r) {
memset(q, 0, n);
break;
}
}
return p;
}
static void unmap_library(struct dso *dso)
{
if (dso->loadmap) {
size_t i;
for (i=0; i<dso->loadmap->nsegs; i++) {
if (!dso->loadmap->segs[i].p_memsz)
continue;
munmap((void *)dso->loadmap->segs[i].addr,
dso->loadmap->segs[i].p_memsz);
}
free(dso->loadmap);
} else if (dso->map && dso->map_len) {
munmap(dso->map, dso->map_len);
}
}
static void *map_library(int fd, struct dso *dso)
{
Ehdr buf[(896+sizeof(Ehdr))/sizeof(Ehdr)];
void *allocated_buf=0;
size_t phsize;
size_t addr_min=SIZE_MAX, addr_max=0, map_len;
size_t this_min, this_max;
size_t nsegs = 0;
off_t off_start;
Ehdr *eh;
Phdr *ph, *ph0;
unsigned prot;
unsigned char *map=MAP_FAILED, *base;
size_t dyn=0;
size_t tls_image=0;
size_t i;
ssize_t l = read(fd, buf, sizeof buf);
eh = buf;
if (l<0) return 0;
if (l<sizeof *eh || (eh->e_type != ET_DYN && eh->e_type != ET_EXEC))
goto noexec;
phsize = eh->e_phentsize * eh->e_phnum;
if (phsize > sizeof buf - sizeof *eh) {
allocated_buf = malloc(phsize);
if (!allocated_buf) return 0;
l = pread(fd, allocated_buf, phsize, eh->e_phoff);
if (l < 0) goto error;
if (l != phsize) goto noexec;
ph = ph0 = allocated_buf;
} else if (eh->e_phoff + phsize > l) {
l = pread(fd, buf+1, phsize, eh->e_phoff);
if (l < 0) goto error;
if (l != phsize) goto noexec;
ph = ph0 = (void *)(buf + 1);
} else {
ph = ph0 = (void *)((char *)buf + eh->e_phoff);
}
for (i=eh->e_phnum; i; i--, ph=(void *)((char *)ph+eh->e_phentsize)) {
if (ph->p_type == PT_DYNAMIC) {
dyn = ph->p_vaddr;
} else if (ph->p_type == PT_TLS) {
tls_image = ph->p_vaddr;
dso->tls.align = ph->p_align;
dso->tls.len = ph->p_filesz;
dso->tls.size = ph->p_memsz;
2014-03-25 16:13:27 +04:00
} else if (ph->p_type == PT_GNU_RELRO) {
dso->relro_start = ph->p_vaddr & -PAGE_SIZE;
dso->relro_end = (ph->p_vaddr + ph->p_memsz) & -PAGE_SIZE;
} else if (ph->p_type == PT_GNU_STACK) {
if (!runtime && ph->p_memsz > __default_stacksize) {
__default_stacksize =
ph->p_memsz < DEFAULT_STACK_MAX ?
ph->p_memsz : DEFAULT_STACK_MAX;
}
}
if (ph->p_type != PT_LOAD) continue;
nsegs++;
if (ph->p_vaddr < addr_min) {
addr_min = ph->p_vaddr;
off_start = ph->p_offset;
prot = (((ph->p_flags&PF_R) ? PROT_READ : 0) |
((ph->p_flags&PF_W) ? PROT_WRITE: 0) |
((ph->p_flags&PF_X) ? PROT_EXEC : 0));
}
if (ph->p_vaddr+ph->p_memsz > addr_max) {
addr_max = ph->p_vaddr+ph->p_memsz;
}
}
if (!dyn) goto noexec;
if (DL_FDPIC && !(eh->e_flags & FDPIC_CONSTDISP_FLAG)) {
dso->loadmap = calloc(1, sizeof *dso->loadmap
+ nsegs * sizeof *dso->loadmap->segs);
if (!dso->loadmap) goto error;
dso->loadmap->nsegs = nsegs;
for (ph=ph0, i=0; i<nsegs; ph=(void *)((char *)ph+eh->e_phentsize)) {
if (ph->p_type != PT_LOAD) continue;
prot = (((ph->p_flags&PF_R) ? PROT_READ : 0) |
((ph->p_flags&PF_W) ? PROT_WRITE: 0) |
((ph->p_flags&PF_X) ? PROT_EXEC : 0));
map = mmap(0, ph->p_memsz + (ph->p_vaddr & PAGE_SIZE-1),
prot, MAP_PRIVATE,
fd, ph->p_offset & -PAGE_SIZE);
if (map == MAP_FAILED) {
unmap_library(dso);
goto error;
}
dso->loadmap->segs[i].addr = (size_t)map +
(ph->p_vaddr & PAGE_SIZE-1);
dso->loadmap->segs[i].p_vaddr = ph->p_vaddr;
dso->loadmap->segs[i].p_memsz = ph->p_memsz;
i++;
if (prot & PROT_WRITE) {
size_t brk = (ph->p_vaddr & PAGE_SIZE-1)
+ ph->p_filesz;
size_t pgbrk = brk + PAGE_SIZE-1 & -PAGE_SIZE;
size_t pgend = brk + ph->p_memsz - ph->p_filesz
+ PAGE_SIZE-1 & -PAGE_SIZE;
if (pgend > pgbrk && mmap_fixed(map+pgbrk,
pgend-pgbrk, prot,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS,
-1, off_start) == MAP_FAILED)
goto error;
memset(map + brk, 0, pgbrk-brk);
}
}
map = (void *)dso->loadmap->segs[0].addr;
map_len = 0;
goto done_mapping;
}
addr_max += PAGE_SIZE-1;
addr_max &= -PAGE_SIZE;
addr_min &= -PAGE_SIZE;
off_start &= -PAGE_SIZE;
map_len = addr_max - addr_min + off_start;
/* The first time, we map too much, possibly even more than
* the length of the file. This is okay because we will not
* use the invalid part; we just need to reserve the right
* amount of virtual address space to map over later. */
map = DL_NOMMU_SUPPORT
? mmap((void *)addr_min, map_len, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
: mmap((void *)addr_min, map_len, prot,
MAP_PRIVATE, fd, off_start);
if (map==MAP_FAILED) goto error;
dso->map = map;
dso->map_len = map_len;
/* If the loaded file is not relocatable and the requested address is
* not available, then the load operation must fail. */
if (eh->e_type != ET_DYN && addr_min && map!=(void *)addr_min) {
errno = EBUSY;
goto error;
}
base = map - addr_min;
dso->phdr = 0;
dso->phnum = 0;
for (ph=ph0, i=eh->e_phnum; i; i--, ph=(void *)((char *)ph+eh->e_phentsize)) {
if (ph->p_type != PT_LOAD) continue;
/* Check if the programs headers are in this load segment, and
* if so, record the address for use by dl_iterate_phdr. */
if (!dso->phdr && eh->e_phoff >= ph->p_offset
&& eh->e_phoff+phsize <= ph->p_offset+ph->p_filesz) {
dso->phdr = (void *)(base + ph->p_vaddr
+ (eh->e_phoff-ph->p_offset));
dso->phnum = eh->e_phnum;
dso->phentsize = eh->e_phentsize;
}
this_min = ph->p_vaddr & -PAGE_SIZE;
this_max = ph->p_vaddr+ph->p_memsz+PAGE_SIZE-1 & -PAGE_SIZE;
off_start = ph->p_offset & -PAGE_SIZE;
prot = (((ph->p_flags&PF_R) ? PROT_READ : 0) |
((ph->p_flags&PF_W) ? PROT_WRITE: 0) |
((ph->p_flags&PF_X) ? PROT_EXEC : 0));
/* Reuse the existing mapping for the lowest-address LOAD */
if ((ph->p_vaddr & -PAGE_SIZE) != addr_min || DL_NOMMU_SUPPORT)
if (mmap_fixed(base+this_min, this_max-this_min, prot, MAP_PRIVATE|MAP_FIXED, fd, off_start) == MAP_FAILED)
goto error;
if (ph->p_memsz > ph->p_filesz && (ph->p_flags&PF_W)) {
size_t brk = (size_t)base+ph->p_vaddr+ph->p_filesz;
size_t pgbrk = brk+PAGE_SIZE-1 & -PAGE_SIZE;
memset((void *)brk, 0, pgbrk-brk & PAGE_SIZE-1);
if (pgbrk-(size_t)base < this_max && mmap_fixed((void *)pgbrk, (size_t)base+this_max-pgbrk, prot, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) == MAP_FAILED)
goto error;
}
}
2011-06-29 08:29:08 +04:00
for (i=0; ((size_t *)(base+dyn))[i]; i+=2)
if (((size_t *)(base+dyn))[i]==DT_TEXTREL) {
if (mprotect(map, map_len, PROT_READ|PROT_WRITE|PROT_EXEC)
&& errno != ENOSYS)
goto error;
2011-06-29 08:29:08 +04:00
break;
}
done_mapping:
dso->base = base;
dso->dynv = laddr(dso, dyn);
if (dso->tls.size) dso->tls.image = laddr(dso, tls_image);
free(allocated_buf);
return map;
noexec:
errno = ENOEXEC;
error:
if (map!=MAP_FAILED) unmap_library(dso);
free(allocated_buf);
return 0;
}
static int path_open(const char *name, const char *s, char *buf, size_t buf_size)
{
size_t l;
int fd;
for (;;) {
s += strspn(s, ":\n");
l = strcspn(s, ":\n");
if (l-1 >= INT_MAX) return -1;
if (snprintf(buf, buf_size, "%.*s/%s", (int)l, s, name) < buf_size) {
if ((fd = open(buf, O_RDONLY|O_CLOEXEC))>=0) return fd;
switch (errno) {
case ENOENT:
case ENOTDIR:
case EACCES:
case ENAMETOOLONG:
break;
default:
/* Any negative value but -1 will inhibit
* futher path search. */
return -2;
}
}
s += l;
}
}
static int fixup_rpath(struct dso *p, char *buf, size_t buf_size)
{
size_t n, l;
const char *s, *t, *origin;
char *d;
if (p->rpath || !p->rpath_orig) return 0;
if (!strchr(p->rpath_orig, '$')) {
p->rpath = p->rpath_orig;
return 0;
}
n = 0;
s = p->rpath_orig;
while ((t=strchr(s, '$'))) {
if (strncmp(t, "$ORIGIN", 7) && strncmp(t, "${ORIGIN}", 9))
return 0;
s = t+1;
n++;
}
if (n > SSIZE_MAX/PATH_MAX) return 0;
if (p->kernel_mapped) {
/* $ORIGIN searches cannot be performed for the main program
* when it is suid/sgid/AT_SECURE. This is because the
* pathname is under the control of the caller of execve.
* For libraries, however, $ORIGIN can be processed safely
* since the library's pathname came from a trusted source
* (either system paths or a call to dlopen). */
if (libc.secure)
return 0;
l = readlink("/proc/self/exe", buf, buf_size);
if (l == -1) switch (errno) {
case ENOENT:
case ENOTDIR:
case EACCES:
break;
default:
return -1;
}
if (l >= buf_size)
return 0;
buf[l] = 0;
origin = buf;
} else {
origin = p->name;
}
t = strrchr(origin, '/');
if (t) {
l = t-origin;
} else {
/* Normally p->name will always be an absolute or relative
* pathname containing at least one '/' character, but in the
* case where ldso was invoked as a command to execute a
* program in the working directory, app.name may not. Fix. */
origin = ".";
l = 1;
}
/* Disallow non-absolute origins for suid/sgid/AT_SECURE. */
if (libc.secure && *origin != '/')
return 0;
p->rpath = malloc(strlen(p->rpath_orig) + n*l + 1);
if (!p->rpath) return -1;
d = p->rpath;
s = p->rpath_orig;
while ((t=strchr(s, '$'))) {
memcpy(d, s, t-s);
d += t-s;
memcpy(d, origin, l);
d += l;
/* It was determined previously that the '$' is followed
* either by "ORIGIN" or "{ORIGIN}". */
s = t + 7 + 2*(t[1]=='{');
}
strcpy(d, s);
return 0;
}
static void decode_dyn(struct dso *p)
{
size_t dyn[DYN_CNT];
decode_vec(p->dynv, dyn, DYN_CNT);
p->syms = laddr(p, dyn[DT_SYMTAB]);
p->strings = laddr(p, dyn[DT_STRTAB]);
if (dyn[0]&(1<<DT_HASH))
p->hashtab = laddr(p, dyn[DT_HASH]);
if (dyn[0]&(1<<DT_RPATH))
p->rpath_orig = p->strings + dyn[DT_RPATH];
if (dyn[0]&(1<<DT_RUNPATH))
p->rpath_orig = p->strings + dyn[DT_RUNPATH];
if (dyn[0]&(1<<DT_PLTGOT))
p->got = laddr(p, dyn[DT_PLTGOT]);
if (search_vec(p->dynv, dyn, DT_GNU_HASH))
p->ghashtab = laddr(p, *dyn);
if (search_vec(p->dynv, dyn, DT_VERSYM))
p->versym = laddr(p, *dyn);
}
static size_t count_syms(struct dso *p)
{
if (p->hashtab) return p->hashtab[1];
size_t nsym, i;
uint32_t *buckets = p->ghashtab + 4 + (p->ghashtab[2]*sizeof(size_t)/4);
uint32_t *hashval;
for (i = nsym = 0; i < p->ghashtab[0]; i++) {
if (buckets[i] > nsym)
nsym = buckets[i];
}
if (nsym) {
hashval = buckets + p->ghashtab[0] + (nsym - p->ghashtab[1]);
do nsym++;
while (!(*hashval++ & 1));
}
return nsym;
}
static void *dl_mmap(size_t n)
{
void *p;
int prot = PROT_READ|PROT_WRITE, flags = MAP_ANONYMOUS|MAP_PRIVATE;
#ifdef SYS_mmap2
p = (void *)__syscall(SYS_mmap2, 0, n, prot, flags, -1, 0);
#else
p = (void *)__syscall(SYS_mmap, 0, n, prot, flags, -1, 0);
#endif
return (unsigned long)p > -4096UL ? 0 : p;
}
static void makefuncdescs(struct dso *p)
{
static int self_done;
size_t nsym = count_syms(p);
size_t i, size = nsym * sizeof(*p->funcdescs);
if (!self_done) {
p->funcdescs = dl_mmap(size);
self_done = 1;
} else {
p->funcdescs = malloc(size);
}
if (!p->funcdescs) {
if (!runtime) a_crash();
error("Error allocating function descriptors for %s", p->name);
longjmp(*rtld_fail, 1);
}
for (i=0; i<nsym; i++) {
if ((p->syms[i].st_info&0xf)==STT_FUNC && p->syms[i].st_shndx) {
p->funcdescs[i].addr = laddr(p, p->syms[i].st_value);
p->funcdescs[i].got = p->got;
} else {
p->funcdescs[i].addr = 0;
p->funcdescs[i].got = 0;
}
}
}
static struct dso *load_library(const char *name, struct dso *needed_by)
{
char buf[2*NAME_MAX+2];
const char *pathname;
unsigned char *map;
struct dso *p, temp_dso = {0};
int fd;
struct stat st;
size_t alloc_size;
int n_th = 0;
int is_self = 0;
if (!*name) {
errno = EINVAL;
return 0;
}
/* Catch and block attempts to reload the implementation itself */
if (name[0]=='l' && name[1]=='i' && name[2]=='b') {
static const char reserved[] =
"c.pthread.rt.m.dl.util.xnet.";
const char *rp, *next;
for (rp=reserved; *rp; rp=next) {
next = strchr(rp, '.') + 1;
if (strncmp(name+3, rp, next-rp) == 0)
break;
}
if (*rp) {
if (ldd_mode) {
/* Track which names have been resolved
* and only report each one once. */
static unsigned reported;
unsigned mask = 1U<<(rp-reserved);
if (!(reported & mask)) {
reported |= mask;
dprintf(1, "\t%s => %s (%p)\n",
name, ldso.name,
ldso.base);
}
}
is_self = 1;
}
}
if (!strcmp(name, ldso.name)) is_self = 1;
if (is_self) {
if (!ldso.prev) {
tail->next = &ldso;
ldso.prev = tail;
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
tail = &ldso;
}
return &ldso;
}
if (strchr(name, '/')) {
pathname = name;
fd = open(name, O_RDONLY|O_CLOEXEC);
} else {
/* Search for the name to see if it's already loaded */
for (p=head->next; p; p=p->next) {
if (p->shortname && !strcmp(p->shortname, name)) {
return p;
}
}
if (strlen(name) > NAME_MAX) return 0;
fd = -1;
if (env_path) fd = path_open(name, env_path, buf, sizeof buf);
for (p=needed_by; fd == -1 && p; p=p->needed_by) {
if (fixup_rpath(p, buf, sizeof buf) < 0)
fd = -2; /* Inhibit further search. */
if (p->rpath)
fd = path_open(name, p->rpath, buf, sizeof buf);
}
if (fd == -1) {
if (!sys_path) {
char *prefix = 0;
size_t prefix_len;
if (ldso.name[0]=='/') {
char *s, *t, *z;
for (s=t=z=ldso.name; *s; s++)
if (*s=='/') z=t, t=s;
prefix_len = z-ldso.name;
if (prefix_len < PATH_MAX)
prefix = ldso.name;
}
if (!prefix) {
prefix = "";
prefix_len = 0;
}
char etc_ldso_path[prefix_len + 1
+ sizeof "/etc/ld-musl-" LDSO_ARCH ".path"];
snprintf(etc_ldso_path, sizeof etc_ldso_path,
"%.*s/etc/ld-musl-" LDSO_ARCH ".path",
(int)prefix_len, prefix);
FILE *f = fopen(etc_ldso_path, "rbe");
if (f) {
if (getdelim(&sys_path, (size_t[1]){0}, 0, f) <= 0) {
free(sys_path);
sys_path = "";
}
fclose(f);
} else if (errno != ENOENT) {
sys_path = "";
}
}
if (!sys_path) sys_path = "/lib:/usr/local/lib:/usr/lib";
fd = path_open(name, sys_path, buf, sizeof buf);
}
pathname = buf;
}
if (fd < 0) return 0;
if (fstat(fd, &st) < 0) {
close(fd);
return 0;
}
for (p=head->next; p; p=p->next) {
if (p->dev == st.st_dev && p->ino == st.st_ino) {
/* If this library was previously loaded with a
* pathname but a search found the same inode,
* setup its shortname so it can be found by name. */
if (!p->shortname && pathname != name)
p->shortname = strrchr(p->name, '/')+1;
close(fd);
return p;
}
}
map = noload ? 0 : map_library(fd, &temp_dso);
close(fd);
if (!map) return 0;
/* Avoid the danger of getting two versions of libc mapped into the
* same process when an absolute pathname was used. The symbols
* checked are chosen to catch both musl and glibc, and to avoid
* false positives from interposition-hack libraries. */
decode_dyn(&temp_dso);
if (find_sym(&temp_dso, "__libc_start_main", 1).sym &&
find_sym(&temp_dso, "stdin", 1).sym) {
unmap_library(&temp_dso);
return load_library("libc.so", needed_by);
}
/* Past this point, if we haven't reached runtime yet, ldso has
* committed either to use the mapped library or to abort execution.
* Unmapping is not possible, so we can safely reclaim gaps. */
if (!runtime) reclaim_gaps(&temp_dso);
/* Allocate storage for the new DSO. When there is TLS, this
* storage must include a reservation for all pre-existing
* threads to obtain copies of both the new TLS, and an
* extended DTV capable of storing an additional slot for
* the newly-loaded DSO. */
alloc_size = sizeof *p + strlen(pathname) + 1;
if (runtime && temp_dso.tls.image) {
size_t per_th = temp_dso.tls.size + temp_dso.tls.align
+ sizeof(void *) * (tls_cnt+3);
n_th = libc.threads_minus_1 + 1;
if (n_th > SSIZE_MAX / per_th) alloc_size = SIZE_MAX;
else alloc_size += n_th * per_th;
}
p = calloc(1, alloc_size);
if (!p) {
unmap_library(&temp_dso);
return 0;
}
memcpy(p, &temp_dso, sizeof temp_dso);
p->dev = st.st_dev;
p->ino = st.st_ino;
p->needed_by = needed_by;
p->name = p->buf;
p->runtime_loaded = runtime;
strcpy(p->name, pathname);
/* Add a shortname only if name arg was not an explicit pathname. */
if (pathname != name) p->shortname = strrchr(p->name, '/')+1;
if (p->tls.image) {
p->tls_id = ++tls_cnt;
tls_align = MAXP2(tls_align, p->tls.align);
#ifdef TLS_ABOVE_TP
p->tls.offset = tls_offset + ( (p->tls.align-1) &
(-tls_offset + (uintptr_t)p->tls.image) );
tls_offset = p->tls.offset + p->tls.size;
#else
tls_offset += p->tls.size + p->tls.align - 1;
tls_offset -= (tls_offset + (uintptr_t)p->tls.image)
& (p->tls.align-1);
p->tls.offset = tls_offset;
#endif
p->new_dtv = (void *)(-sizeof(size_t) &
(uintptr_t)(p->name+strlen(p->name)+sizeof(size_t)));
p->new_tls = (void *)(p->new_dtv + n_th*(tls_cnt+1));
if (tls_tail) tls_tail->next = &p->tls;
else libc.tls_head = &p->tls;
tls_tail = &p->tls;
}
tail->next = p;
p->prev = tail;
tail = p;
if (DL_FDPIC) makefuncdescs(p);
if (ldd_mode) dprintf(1, "\t%s => %s (%p)\n", name, pathname, p->base);
return p;
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
static void load_direct_deps(struct dso *p)
{
size_t i, cnt=0;
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (p->deps) return;
/* For head, all preloads are direct pseudo-dependencies.
* Count and include them now to avoid realloc later. */
if (p==head) for (struct dso *q=p->next; q; q=q->next)
cnt++;
for (i=0; p->dynv[i]; i+=2)
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (p->dynv[i] == DT_NEEDED) cnt++;
/* Use builtin buffer for apps with no external deps, to
* preserve property of no runtime failure paths. */
p->deps = (p==head && cnt<2) ? builtin_deps :
calloc(cnt+1, sizeof *p->deps);
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (!p->deps) {
error("Error loading dependencies for %s", p->name);
if (runtime) longjmp(*rtld_fail, 1);
}
cnt=0;
if (p==head) for (struct dso *q=p->next; q; q=q->next)
p->deps[cnt++] = q;
for (i=0; p->dynv[i]; i+=2) {
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (p->dynv[i] != DT_NEEDED) continue;
struct dso *dep = load_library(p->strings + p->dynv[i+1], p);
if (!dep) {
error("Error loading shared library %s: %m (needed by %s)",
p->strings + p->dynv[i+1], p->name);
if (runtime) longjmp(*rtld_fail, 1);
continue;
}
p->deps[cnt++] = dep;
}
p->deps[cnt] = 0;
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
p->ndeps_direct = cnt;
}
static void load_deps(struct dso *p)
{
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (p->deps) return;
for (; p; p=p->next)
load_direct_deps(p);
}
static void extend_bfs_deps(struct dso *p)
{
size_t i, j, cnt, ndeps_all;
struct dso **tmp;
/* Can't use realloc if the original p->deps was allocated at
* program entry and malloc has been replaced, or if it's
* the builtin non-allocated trivial main program deps array. */
int no_realloc = (__malloc_replaced && !p->runtime_loaded)
|| p->deps == builtin_deps;
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (p->bfs_built) return;
ndeps_all = p->ndeps_direct;
/* Mark existing (direct) deps so they won't be duplicated. */
for (i=0; p->deps[i]; i++)
p->deps[i]->mark = 1;
/* For each dependency already in the list, copy its list of direct
* dependencies to the list, excluding any items already in the
* list. Note that the list this loop iterates over will grow during
* the loop, but since duplicates are excluded, growth is bounded. */
for (i=0; p->deps[i]; i++) {
struct dso *dep = p->deps[i];
for (j=cnt=0; j<dep->ndeps_direct; j++)
if (!dep->deps[j]->mark) cnt++;
tmp = no_realloc ?
malloc(sizeof(*tmp) * (ndeps_all+cnt+1)) :
realloc(p->deps, sizeof(*tmp) * (ndeps_all+cnt+1));
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (!tmp) {
error("Error recording dependencies for %s", p->name);
if (runtime) longjmp(*rtld_fail, 1);
continue;
}
if (no_realloc) {
memcpy(tmp, p->deps, sizeof(*tmp) * (ndeps_all+1));
no_realloc = 0;
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
p->deps = tmp;
for (j=0; j<dep->ndeps_direct; j++) {
if (dep->deps[j]->mark) continue;
dep->deps[j]->mark = 1;
p->deps[ndeps_all++] = dep->deps[j];
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
p->deps[ndeps_all] = 0;
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
p->bfs_built = 1;
for (p=head; p; p=p->next)
p->mark = 0;
}
2011-08-16 08:24:36 +04:00
static void load_preload(char *s)
{
int tmp;
char *z;
for (z=s; *z; s=z) {
for ( ; *s && (isspace(*s) || *s==':'); s++);
for (z=s; *z && !isspace(*z) && *z!=':'; z++);
2011-08-16 08:24:36 +04:00
tmp = *z;
*z = 0;
load_library(s, 0);
2011-08-16 08:24:36 +04:00
*z = tmp;
}
}
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
static void add_syms(struct dso *p)
{
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
if (!p->syms_next && syms_tail != p) {
syms_tail->syms_next = p;
syms_tail = p;
}
}
static void revert_syms(struct dso *old_tail)
{
struct dso *p, *next;
/* Chop off the tail of the list of dsos that participate in
* the global symbol table, reverting them to RTLD_LOCAL. */
for (p=old_tail; p; p=next) {
next = p->syms_next;
p->syms_next = 0;
}
syms_tail = old_tail;
}
static void do_mips_relocs(struct dso *p, size_t *got)
{
size_t i, j, rel[2];
unsigned char *base = p->base;
i=0; search_vec(p->dynv, &i, DT_MIPS_LOCAL_GOTNO);
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
if (p==&ldso) {
got += i;
} else {
while (i--) *got++ += (size_t)base;
}
j=0; search_vec(p->dynv, &j, DT_MIPS_GOTSYM);
i=0; search_vec(p->dynv, &i, DT_MIPS_SYMTABNO);
Sym *sym = p->syms + j;
rel[0] = (unsigned char *)got - base;
for (i-=j; i; i--, sym++, rel[0]+=sizeof(size_t)) {
rel[1] = R_INFO(sym-p->syms, R_MIPS_JUMP_SLOT);
do_relocs(p, rel, sizeof rel, 2);
}
}
static void reloc_all(struct dso *p)
{
size_t dyn[DYN_CNT];
for (; p; p=p->next) {
if (p->relocated) continue;
decode_vec(p->dynv, dyn, DYN_CNT);
if (NEED_MIPS_GOT_RELOCS)
do_mips_relocs(p, laddr(p, dyn[DT_PLTGOT]));
do_relocs(p, laddr(p, dyn[DT_JMPREL]), dyn[DT_PLTRELSZ],
2+(dyn[DT_PLTREL]==DT_RELA));
do_relocs(p, laddr(p, dyn[DT_REL]), dyn[DT_RELSZ], 2);
do_relocs(p, laddr(p, dyn[DT_RELA]), dyn[DT_RELASZ], 3);
2014-03-25 16:13:27 +04:00
if (head != &ldso && p->relro_start != p->relro_end &&
mprotect(laddr(p, p->relro_start), p->relro_end-p->relro_start, PROT_READ)
&& errno != ENOSYS) {
error("Error relocating %s: RELRO protection failed: %m",
2014-03-25 16:13:27 +04:00
p->name);
if (runtime) longjmp(*rtld_fail, 1);
2014-03-25 16:13:27 +04:00
}
p->relocated = 1;
}
}
static void kernel_mapped_dso(struct dso *p)
{
size_t min_addr = -1, max_addr = 0, cnt;
Phdr *ph = p->phdr;
for (cnt = p->phnum; cnt--; ph = (void *)((char *)ph + p->phentsize)) {
if (ph->p_type == PT_DYNAMIC) {
p->dynv = laddr(p, ph->p_vaddr);
} else if (ph->p_type == PT_GNU_RELRO) {
2014-03-25 16:13:27 +04:00
p->relro_start = ph->p_vaddr & -PAGE_SIZE;
p->relro_end = (ph->p_vaddr + ph->p_memsz) & -PAGE_SIZE;
} else if (ph->p_type == PT_GNU_STACK) {
if (!runtime && ph->p_memsz > __default_stacksize) {
__default_stacksize =
ph->p_memsz < DEFAULT_STACK_MAX ?
ph->p_memsz : DEFAULT_STACK_MAX;
}
2014-03-25 16:13:27 +04:00
}
if (ph->p_type != PT_LOAD) continue;
if (ph->p_vaddr < min_addr)
min_addr = ph->p_vaddr;
if (ph->p_vaddr+ph->p_memsz > max_addr)
max_addr = ph->p_vaddr+ph->p_memsz;
}
min_addr &= -PAGE_SIZE;
max_addr = (max_addr + PAGE_SIZE-1) & -PAGE_SIZE;
p->map = p->base + min_addr;
p->map_len = max_addr - min_addr;
p->kernel_mapped = 1;
}
void __libc_exit_fini()
{
struct dso *p;
size_t dyn[DYN_CNT];
int self = __pthread_self()->tid;
/* Take both locks before setting shutting_down, so that
* either lock is sufficient to read its value. The lock
* order matches that in dlopen to avoid deadlock. */
pthread_rwlock_wrlock(&lock);
pthread_mutex_lock(&init_fini_lock);
shutting_down = 1;
pthread_rwlock_unlock(&lock);
for (p=fini_head; p; p=p->fini_next) {
while (p->ctor_visitor && p->ctor_visitor!=self)
pthread_cond_wait(&ctor_cond, &init_fini_lock);
if (!p->constructed) continue;
decode_vec(p->dynv, dyn, DYN_CNT);
if (dyn[0] & (1<<DT_FINI_ARRAY)) {
size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
while (n--) ((void (*)(void))*--fn)();
}
#ifndef NO_LEGACY_INITFINI
if ((dyn[0] & (1<<DT_FINI)) && dyn[DT_FINI])
fpaddr(p, dyn[DT_FINI])();
#endif
}
}
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
static struct dso **queue_ctors(struct dso *dso)
{
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
size_t cnt, qpos, spos, i;
struct dso *p, **queue, **stack;
if (ldd_mode) return 0;
/* Bound on queue size is the total number of indirect deps.
* If a bfs deps list was built, we can use it. Otherwise,
* bound by the total number of DSOs, which is always safe and
* is reasonable we use it (for main app at startup). */
if (dso->bfs_built) {
for (cnt=0; dso->deps[cnt]; cnt++)
dso->deps[cnt]->mark = 0;
cnt++; /* self, not included in deps */
} else {
for (cnt=0, p=head; p; cnt++, p=p->next)
p->mark = 0;
}
cnt++; /* termination slot */
if (dso==head && cnt <= countof(builtin_ctor_queue))
queue = builtin_ctor_queue;
else
queue = calloc(cnt, sizeof *queue);
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
if (!queue) {
error("Error allocating constructor queue: %m\n");
if (runtime) longjmp(*rtld_fail, 1);
return 0;
}
/* Opposite ends of the allocated buffer serve as an output queue
* and a working stack. Setup initial stack with just the argument
* dso and initial queue empty... */
stack = queue;
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
qpos = 0;
spos = cnt;
stack[--spos] = dso;
dso->next_dep = 0;
dso->mark = 1;
/* Then perform pseudo-DFS sort, but ignoring circular deps. */
while (spos<cnt) {
p = stack[spos++];
while (p->next_dep < p->ndeps_direct) {
if (p->deps[p->next_dep]->mark) {
p->next_dep++;
} else {
stack[--spos] = p;
p = p->deps[p->next_dep];
p->next_dep = 0;
p->mark = 1;
}
}
queue[qpos++] = p;
}
queue[qpos] = 0;
for (i=0; i<qpos; i++) queue[i]->mark = 0;
return queue;
}
static void do_init_fini(struct dso **queue)
{
struct dso *p;
size_t dyn[DYN_CNT], i;
int self = __pthread_self()->tid;
pthread_mutex_lock(&init_fini_lock);
for (i=0; (p=queue[i]); i++) {
while ((p->ctor_visitor && p->ctor_visitor!=self) || shutting_down)
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
pthread_cond_wait(&ctor_cond, &init_fini_lock);
if (p->ctor_visitor || p->constructed)
continue;
p->ctor_visitor = self;
decode_vec(p->dynv, dyn, DYN_CNT);
if (dyn[0] & ((1<<DT_FINI) | (1<<DT_FINI_ARRAY))) {
p->fini_next = fini_head;
fini_head = p;
}
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
pthread_mutex_unlock(&init_fini_lock);
#ifndef NO_LEGACY_INITFINI
if ((dyn[0] & (1<<DT_INIT)) && dyn[DT_INIT])
fpaddr(p, dyn[DT_INIT])();
#endif
if (dyn[0] & (1<<DT_INIT_ARRAY)) {
size_t n = dyn[DT_INIT_ARRAYSZ]/sizeof(size_t);
size_t *fn = laddr(p, dyn[DT_INIT_ARRAY]);
while (n--) ((void (*)(void))*fn++)();
}
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
pthread_mutex_lock(&init_fini_lock);
p->ctor_visitor = 0;
p->constructed = 1;
pthread_cond_broadcast(&ctor_cond);
}
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
pthread_mutex_unlock(&init_fini_lock);
}
void __libc_start_init(void)
{
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
do_init_fini(main_ctor_queue);
if (!__malloc_replaced && main_ctor_queue != builtin_ctor_queue)
free(main_ctor_queue);
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
main_ctor_queue = 0;
}
static void dl_debug_state(void)
{
}
weak_alias(dl_debug_state, _dl_debug_state);
void __init_tls(size_t *auxv)
{
}
static void update_tls_size()
{
libc.tls_cnt = tls_cnt;
libc.tls_align = tls_align;
libc.tls_size = ALIGN(
(1+tls_cnt) * sizeof(void *) +
tls_offset +
sizeof(struct pthread) +
tls_align * 2,
tls_align);
}
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
static void install_new_tls(void)
{
sigset_t set;
pthread_t self = __pthread_self(), td;
struct dso *dtv_provider = container_of(tls_tail, struct dso, tls);
uintptr_t (*newdtv)[tls_cnt+1] = (void *)dtv_provider->new_dtv;
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
struct dso *p;
size_t i, j;
size_t old_cnt = self->dtv[0];
__block_app_sigs(&set);
__tl_lock();
/* Copy existing dtv contents from all existing threads. */
for (i=0, td=self; !i || td!=self; i++, td=td->next) {
memcpy(newdtv+i, td->dtv,
(old_cnt+1)*sizeof(uintptr_t));
newdtv[i][0] = tls_cnt;
}
/* Install new dtls into the enlarged, uninstalled dtv copies. */
for (p=head; ; p=p->next) {
if (p->tls_id <= old_cnt) continue;
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
unsigned char *mem = p->new_tls;
for (j=0; j<i; j++) {
unsigned char *new = mem;
new += ((uintptr_t)p->tls.image - (uintptr_t)mem)
& (p->tls.align-1);
memcpy(new, p->tls.image, p->tls.len);
newdtv[j][p->tls_id] =
(uintptr_t)new + DTP_OFFSET;
mem += p->tls.size + p->tls.align;
}
if (p->tls_id == tls_cnt) break;
}
/* Broadcast barrier to ensure contents of new dtv is visible
* if the new dtv pointer is. The __membarrier function has a
* fallback emulation using signals for kernels that lack the
* feature at the syscall level. */
__membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
/* Install new dtv for each thread. */
for (j=0, td=self; !j || td!=self; j++, td=td->next) {
td->dtv = td->dtv_copy = newdtv[j];
}
__tl_unlock();
__restore_sigs(&set);
}
/* Stage 1 of the dynamic linker is defined in dlstart.c. It calls the
* following stage 2 and stage 3 functions via primitive symbolic lookup
* since it does not have access to their addresses to begin with. */
/* Stage 2 of the dynamic linker is called after relative relocations
* have been processed. It can make function calls to static functions
* and access string literals and static data, but cannot use extern
* symbols. Its job is to perform symbolic relocations on the dynamic
* linker itself, but some of the relocations performed may need to be
* replaced later due to copy relocations in the main program. */
hidden void __dls2(unsigned char *base, size_t *sp)
{
size_t *auxv;
for (auxv=sp+1+*sp+1; *auxv; auxv++);
auxv++;
if (DL_FDPIC) {
void *p1 = (void *)sp[-2];
void *p2 = (void *)sp[-1];
if (!p1) {
size_t aux[AUX_CNT];
decode_vec(auxv, aux, AUX_CNT);
if (aux[AT_BASE]) ldso.base = (void *)aux[AT_BASE];
else ldso.base = (void *)(aux[AT_PHDR] & -4096);
}
app_loadmap = p2 ? p1 : 0;
ldso.loadmap = p2 ? p2 : p1;
ldso.base = laddr(&ldso, 0);
} else {
ldso.base = base;
}
Ehdr *ehdr = (void *)ldso.base;
ldso.name = ldso.shortname = "libc.so";
ldso.phnum = ehdr->e_phnum;
ldso.phdr = laddr(&ldso, ehdr->e_phoff);
ldso.phentsize = ehdr->e_phentsize;
kernel_mapped_dso(&ldso);
decode_dyn(&ldso);
if (DL_FDPIC) makefuncdescs(&ldso);
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
/* Prepare storage for to save clobbered REL addends so they
* can be reused in stage 3. There should be very few. If
* something goes wrong and there are a huge number, abort
* instead of risking stack overflow. */
size_t dyn[DYN_CNT];
decode_vec(ldso.dynv, dyn, DYN_CNT);
size_t *rel = laddr(&ldso, dyn[DT_REL]);
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
size_t rel_size = dyn[DT_RELSZ];
size_t symbolic_rel_cnt = 0;
apply_addends_to = rel;
for (; rel_size; rel+=2, rel_size-=2*sizeof(size_t))
if (!IS_RELATIVE(rel[1], ldso.syms)) symbolic_rel_cnt++;
reprocess all libc/ldso symbolic relocations in dynamic linking stage 3 commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early relocations and subsequent reprocessing as part of the dynamic linker bootstrap overhaul, to allow use of arbitrary libc functions before the main application and libraries are loaded, but only reprocessed GOT/PLT relocation types. commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of non-GOT/PLT relocations to fix an actual regression that was observed on powerpc, but only for RELA format tables with out-of-line addends. REL table (inline addends at the relocation address) reprocessing is trickier because the first relocation pass clobbers the addends. this patch extends symbolic relocation reprocessing for libc/ldso to support all relocation types, whether REL or RELA format tables are used. it is believed not to alter behavior on any existing archs for the current dynamic linker and libc code. the motivations for this change are consistency and future-proofing. it ensures that behavior does not differ depending on whether REL or RELA tables are used, which could lead to undetected arch-specific bugs. it also ensures that, if in the future code depending on additional relocation types is added to libc.so, either at the source level or as part of the compiler runtime that gets pulled in (for example, soft-float with TLS for fenv), the new code will work properly. the implementation concept is simple: stage 2 of the dynamic linker counts the number of symbolic relocations in the libc/ldso REL table and allocates a VLA to save their addends into; stage 3 then uses the saved addends in place of the inline ones which were clobbered. for stack safety, a hard limit (currently 4k) is imposed on the number of such addends; this should be a couple orders of magnitude larger than the actual need. this number is not a runtime variable that could break fail-safety; it is constant for a given libc.so build.
2015-05-26 06:33:59 +03:00
if (symbolic_rel_cnt >= ADDEND_LIMIT) a_crash();
size_t addends[symbolic_rel_cnt+1];
saved_addends = addends;
head = &ldso;
reloc_all(&ldso);
ldso.relocated = 0;
/* Call dynamic linker stage-2b, __dls2b, looking it up
* symbolically as a barrier against moving the address
* load across the above relocation processing. */
struct symdef dls2b_def = find_sym(&ldso, "__dls2b", 0);
if (DL_FDPIC) ((stage3_func)&ldso.funcdescs[dls2b_def.sym-ldso.syms])(sp, auxv);
else ((stage3_func)laddr(&ldso, dls2b_def.sym->st_value))(sp, auxv);
}
/* Stage 2b sets up a valid thread pointer, which requires relocations
* completed in stage 2, and on which stage 3 is permitted to depend.
* This is done as a separate stage, with symbolic lookup as a barrier,
* so that loads of the thread pointer and &errno can be pure/const and
* thereby hoistable. */
void __dls2b(size_t *sp, size_t *auxv)
{
/* Setup early thread pointer in builtin_tls for ldso/libc itself to
* use during dynamic linking. If possible it will also serve as the
* thread pointer at runtime. */
search_vec(auxv, &__hwcap, AT_HWCAP);
libc.auxv = auxv;
libc.tls_size = sizeof builtin_tls;
libc.tls_align = tls_align;
if (__init_tp(__copy_tls((void *)builtin_tls)) < 0) {
a_crash();
}
struct symdef dls3_def = find_sym(&ldso, "__dls3", 0);
if (DL_FDPIC) ((stage3_func)&ldso.funcdescs[dls3_def.sym-ldso.syms])(sp, auxv);
else ((stage3_func)laddr(&ldso, dls3_def.sym->st_value))(sp, auxv);
}
/* Stage 3 of the dynamic linker is called with the dynamic linker/libc
* fully functional. Its job is to load (if not already loaded) and
* process dependencies and relocations for the main application and
* transfer control to its entry point. */
void __dls3(size_t *sp, size_t *auxv)
{
static struct dso app, vdso;
size_t aux[AUX_CNT];
size_t i;
2011-08-16 08:24:36 +04:00
char *env_preload=0;
char *replace_argv0=0;
2012-08-26 01:31:59 +04:00
size_t vdso_base;
int argc = *sp;
char **argv = (void *)(sp+1);
char **argv_orig = argv;
add support for init/fini array in main program, and greatly simplify modern (4.7.x and later) gcc uses init/fini arrays, rather than the legacy _init/_fini function pasting and crtbegin/crtend ctors/dtors system, on most or all archs. some archs had already switched a long time ago. without following this change, global ctors/dtors will cease to work under musl when building with new gcc versions. the most surprising part of this patch is that it actually reduces the size of the init code, for both static and shared libc. this is achieved by (1) unifying the handling main program and shared libraries in the dynamic linker, and (2) eliminating the glibc-inspired rube goldberg machine for passing around init and fini function pointers. to clarify, some background: the function signature for __libc_start_main was based on glibc, as part of the original goal of being able to run some glibc-linked binaries. it worked by having the crt1 code, which is linked into every application, static or dynamic, obtain and pass pointers to the init and fini functions, which __libc_start_main is then responsible for using and recording for later use, as necessary. however, in neither the static-linked nor dynamic-linked case do we actually need crt1.o's help. with dynamic linking, all the pointers are available in the _DYNAMIC block. with static linking, it's safe to simply access the _init/_fini and __init_array_start, etc. symbols directly. obviously changing the __libc_start_main function signature in an incompatible way would break both old musl-linked programs and glibc-linked programs, so let's not do that. instead, the function can just ignore the information it doesn't need. new archs need not even provide the useless args in their versions of crt1.o. existing archs should continue to provide it as long as there is an interest in having newly-linked applications be able to run on old versions of musl; at some point in the future, this support can be removed.
2013-07-21 11:00:54 +04:00
char **envp = argv+argc+1;
/* Find aux vector just past environ[] and use it to initialize
* global data that may be needed before we can make syscalls. */
__environ = envp;
decode_vec(auxv, aux, AUX_CNT);
overhaul i386 syscall mechanism not to depend on external asm source this is the first part of a series of patches intended to make __syscall fully self-contained in the object file produced using syscall.h, which will make it possible for crt1 code to perform syscalls. the (confusingly named) i386 __vsyscall mechanism, which this commit removes, was introduced before the presence of a valid thread pointer was mandatory; back then the thread pointer was setup lazily only if threads were used. the intent was to be able to perform syscalls using the kernel's fast entry point in the VDSO, which can use the sysenter (Intel) or syscall (AMD) instruction instead of int $128, but without inlining an access to the __syscall global at the point of each syscall, which would incur a significant size cost from PIC setup everywhere. the mechanism also shuffled registers/calling convention around to avoid spills of call-saved registers, and to avoid allocating ebx or ebp via asm constraints, since there are plenty of broken-but-supported compiler versions which are incapable of allocating ebx with -fPIC or ebp with -fno-omit-frame-pointer. the new mechanism preserves the properties of avoiding spills and avoiding allocation of ebx/ebp in constraints, but does it inline, using some fairly simple register shuffling, and uses a field of the thread structure rather than global data for the vdso-provided syscall code address. for now, the external __syscall function is refactored not to use the old __vsyscall so it can be kept, but the intent is to remove it too.
2019-04-11 00:10:36 +03:00
search_vec(auxv, &__sysinfo, AT_SYSINFO);
__pthread_self()->sysinfo = __sysinfo;
libc.page_size = aux[AT_PAGESZ];
libc.secure = ((aux[0]&0x7800)!=0x7800 || aux[AT_UID]!=aux[AT_EUID]
|| aux[AT_GID]!=aux[AT_EGID] || aux[AT_SECURE]);
/* Only trust user/env if kernel says we're not suid/sgid */
if (!libc.secure) {
env_path = getenv("LD_LIBRARY_PATH");
env_preload = getenv("LD_PRELOAD");
}
/* If the main program was already loaded by the kernel,
* AT_PHDR will point to some location other than the dynamic
* linker's program headers. */
if (aux[AT_PHDR] != (size_t)ldso.phdr) {
size_t interp_off = 0;
size_t tls_image = 0;
/* Find load address of the main program, via AT_PHDR vs PT_PHDR. */
Phdr *phdr = app.phdr = (void *)aux[AT_PHDR];
app.phnum = aux[AT_PHNUM];
app.phentsize = aux[AT_PHENT];
for (i=aux[AT_PHNUM]; i; i--, phdr=(void *)((char *)phdr + aux[AT_PHENT])) {
if (phdr->p_type == PT_PHDR)
app.base = (void *)(aux[AT_PHDR] - phdr->p_vaddr);
else if (phdr->p_type == PT_INTERP)
interp_off = (size_t)phdr->p_vaddr;
else if (phdr->p_type == PT_TLS) {
tls_image = phdr->p_vaddr;
app.tls.len = phdr->p_filesz;
app.tls.size = phdr->p_memsz;
app.tls.align = phdr->p_align;
}
}
if (DL_FDPIC) app.loadmap = app_loadmap;
if (app.tls.size) app.tls.image = laddr(&app, tls_image);
if (interp_off) ldso.name = laddr(&app, interp_off);
if ((aux[0] & (1UL<<AT_EXECFN))
&& strncmp((char *)aux[AT_EXECFN], "/proc/", 6))
app.name = (char *)aux[AT_EXECFN];
else
app.name = argv[0];
kernel_mapped_dso(&app);
} else {
int fd;
char *ldname = argv[0];
size_t l = strlen(ldname);
if (l >= 3 && !strcmp(ldname+l-3, "ldd")) ldd_mode = 1;
argv++;
while (argv[0] && argv[0][0]=='-' && argv[0][1]=='-') {
char *opt = argv[0]+2;
*argv++ = (void *)-1;
if (!*opt) {
break;
} else if (!memcmp(opt, "list", 5)) {
ldd_mode = 1;
} else if (!memcmp(opt, "library-path", 12)) {
if (opt[12]=='=') env_path = opt+13;
else if (opt[12]) *argv = 0;
else if (*argv) env_path = *argv++;
} else if (!memcmp(opt, "preload", 7)) {
if (opt[7]=='=') env_preload = opt+8;
else if (opt[7]) *argv = 0;
else if (*argv) env_preload = *argv++;
} else if (!memcmp(opt, "argv0", 5)) {
if (opt[5]=='=') replace_argv0 = opt+6;
else if (opt[5]) *argv = 0;
else if (*argv) replace_argv0 = *argv++;
} else {
argv[0] = 0;
}
}
argv[-1] = (void *)(argc - (argv-argv_orig));
if (!argv[0]) {
dprintf(2, "musl libc (" LDSO_ARCH ")\n"
"Version %s\n"
"Dynamic Program Loader\n"
"Usage: %s [options] [--] pathname%s\n",
__libc_version, ldname,
ldd_mode ? "" : " [args]");
_exit(1);
}
fd = open(argv[0], O_RDONLY);
if (fd < 0) {
dprintf(2, "%s: cannot load %s: %s\n", ldname, argv[0], strerror(errno));
_exit(1);
}
Ehdr *ehdr = (void *)map_library(fd, &app);
if (!ehdr) {
dprintf(2, "%s: %s: Not a valid dynamic program\n", ldname, argv[0]);
_exit(1);
}
close(fd);
ldso.name = ldname;
app.name = argv[0];
aux[AT_ENTRY] = (size_t)laddr(&app, ehdr->e_entry);
/* Find the name that would have been used for the dynamic
* linker had ldd not taken its place. */
if (ldd_mode) {
for (i=0; i<app.phnum; i++) {
if (app.phdr[i].p_type == PT_INTERP)
ldso.name = laddr(&app, app.phdr[i].p_vaddr);
}
dprintf(1, "\t%s (%p)\n", ldso.name, ldso.base);
}
}
if (app.tls.size) {
libc.tls_head = tls_tail = &app.tls;
app.tls_id = tls_cnt = 1;
#ifdef TLS_ABOVE_TP
app.tls.offset = GAP_ABOVE_TP;
app.tls.offset += (-GAP_ABOVE_TP + (uintptr_t)app.tls.image)
& (app.tls.align-1);
tls_offset = app.tls.offset + app.tls.size;
#else
tls_offset = app.tls.offset = app.tls.size
+ ( -((uintptr_t)app.tls.image + app.tls.size)
& (app.tls.align-1) );
#endif
tls_align = MAXP2(tls_align, app.tls.align);
}
decode_dyn(&app);
if (DL_FDPIC) {
makefuncdescs(&app);
if (!app.loadmap) {
app.loadmap = (void *)&app_dummy_loadmap;
app.loadmap->nsegs = 1;
app.loadmap->segs[0].addr = (size_t)app.map;
app.loadmap->segs[0].p_vaddr = (size_t)app.map
- (size_t)app.base;
app.loadmap->segs[0].p_memsz = app.map_len;
}
argv[-3] = (void *)app.loadmap;
}
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
/* Initial dso chain consists only of the app. */
head = tail = syms_tail = &app;
/* Donate unused parts of app and library mapping to malloc */
reclaim_gaps(&app);
reclaim_gaps(&ldso);
/* Load preload/needed libraries, add symbols to global namespace. */
ldso.deps = (struct dso **)no_deps;
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
if (env_preload) load_preload(env_preload);
load_deps(&app);
for (struct dso *p=head; p; p=p->next)
add_syms(p);
/* Attach to vdso, if provided by the kernel, last so that it does
* not become part of the global namespace. */
if (search_vec(auxv, &vdso_base, AT_SYSINFO_EHDR) && vdso_base) {
Ehdr *ehdr = (void *)vdso_base;
Phdr *phdr = vdso.phdr = (void *)(vdso_base + ehdr->e_phoff);
vdso.phnum = ehdr->e_phnum;
vdso.phentsize = ehdr->e_phentsize;
for (i=ehdr->e_phnum; i; i--, phdr=(void *)((char *)phdr + ehdr->e_phentsize)) {
if (phdr->p_type == PT_DYNAMIC)
vdso.dynv = (void *)(vdso_base + phdr->p_offset);
if (phdr->p_type == PT_LOAD)
vdso.base = (void *)(vdso_base - phdr->p_vaddr + phdr->p_offset);
}
vdso.name = "";
vdso.shortname = "linux-gate.so.1";
vdso.relocated = 1;
vdso.deps = (struct dso **)no_deps;
decode_dyn(&vdso);
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
vdso.prev = tail;
tail->next = &vdso;
tail = &vdso;
}
for (i=0; app.dynv[i]; i+=2) {
if (!DT_DEBUG_INDIRECT && app.dynv[i]==DT_DEBUG)
app.dynv[i+1] = (size_t)&debug;
if (DT_DEBUG_INDIRECT && app.dynv[i]==DT_DEBUG_INDIRECT) {
size_t *ptr = (size_t *) app.dynv[i+1];
*ptr = (size_t)&debug;
}
}
2014-03-25 16:13:27 +04:00
/* This must be done before final relocations, since it calls
* malloc, which may be provided by the application. Calling any
* application code prior to the jump to its entry point is not
* valid in our model and does not work with FDPIC, where there
* are additional relocation-like fixups that only the entry point
* code can see to perform. */
main_ctor_queue = queue_ctors(&app);
/* Initial TLS must also be allocated before final relocations
* might result in calloc being a call to application code. */
update_tls_size();
void *initial_tls = builtin_tls;
if (libc.tls_size > sizeof builtin_tls || tls_align > MIN_TLS_ALIGN) {
initial_tls = calloc(libc.tls_size, 1);
always initialize thread pointer at program start this is the first step in an overhaul aimed at greatly simplifying and optimizing everything dealing with thread-local state. previously, the thread pointer was initialized lazily on first access, or at program startup if stack protector was in use, or at certain random places where inconsistent state could be reached if it were not initialized early. while believed to be fully correct, the logic was fragile and non-obvious. in the first phase of the thread pointer overhaul, support is retained (and in some cases improved) for systems/situation where loading the thread pointer fails, e.g. old kernels. some notes on specific changes: - the confusing use of libc.main_thread as an indicator that the thread pointer is initialized is eliminated in favor of an explicit has_thread_pointer predicate. - sigaction no longer needs to ensure that the thread pointer is initialized before installing a signal handler (this was needed to prevent a situation where the signal handler caused the thread pointer to be initialized and the subsequent sigreturn cleared it again) but it still needs to ensure that implementation-internal thread-related signals are not blocked. - pthread tsd initialization for the main thread is deferred in a new manner to minimize bloat in the static-linked __init_tp code. - pthread_setcancelstate no longer needs special handling for the situation before the thread pointer is initialized. it simply fails on systems that cannot support a thread pointer, which are non-conforming anyway. - pthread_cleanup_push/pop now check for missing thread pointer and nop themselves out in this case, so stdio no longer needs to avoid the cancellable path when the thread pointer is not available. a number of cases remain where certain interfaces may crash if the system does not support a thread pointer. at this point, these should be limited to pthread interfaces, and the number of such cases should be fewer than before.
2014-03-25 00:57:11 +04:00
if (!initial_tls) {
dprintf(2, "%s: Error getting %zu bytes thread-local storage: %m\n",
argv[0], libc.tls_size);
_exit(127);
}
}
static_tls_cnt = tls_cnt;
/* The main program must be relocated LAST since it may contain
* copy relocations which depend on libraries' relocations. */
reloc_all(app.next);
reloc_all(&app);
/* Actual copying to new TLS needs to happen after relocations,
* since the TLS images might have contained relocated addresses. */
if (initial_tls != builtin_tls) {
if (__init_tp(__copy_tls(initial_tls)) < 0) {
a_crash();
}
always initialize thread pointer at program start this is the first step in an overhaul aimed at greatly simplifying and optimizing everything dealing with thread-local state. previously, the thread pointer was initialized lazily on first access, or at program startup if stack protector was in use, or at certain random places where inconsistent state could be reached if it were not initialized early. while believed to be fully correct, the logic was fragile and non-obvious. in the first phase of the thread pointer overhaul, support is retained (and in some cases improved) for systems/situation where loading the thread pointer fails, e.g. old kernels. some notes on specific changes: - the confusing use of libc.main_thread as an indicator that the thread pointer is initialized is eliminated in favor of an explicit has_thread_pointer predicate. - sigaction no longer needs to ensure that the thread pointer is initialized before installing a signal handler (this was needed to prevent a situation where the signal handler caused the thread pointer to be initialized and the subsequent sigreturn cleared it again) but it still needs to ensure that implementation-internal thread-related signals are not blocked. - pthread tsd initialization for the main thread is deferred in a new manner to minimize bloat in the static-linked __init_tp code. - pthread_setcancelstate no longer needs special handling for the situation before the thread pointer is initialized. it simply fails on systems that cannot support a thread pointer, which are non-conforming anyway. - pthread_cleanup_push/pop now check for missing thread pointer and nop themselves out in this case, so stdio no longer needs to avoid the cancellable path when the thread pointer is not available. a number of cases remain where certain interfaces may crash if the system does not support a thread pointer. at this point, these should be limited to pthread interfaces, and the number of such cases should be fewer than before.
2014-03-25 00:57:11 +04:00
} else {
size_t tmp_tls_size = libc.tls_size;
pthread_t self = __pthread_self();
/* Temporarily set the tls size to the full size of
* builtin_tls so that __copy_tls will use the same layout
* as it did for before. Then check, just to be safe. */
libc.tls_size = sizeof builtin_tls;
if (__copy_tls((void*)builtin_tls) != self) a_crash();
libc.tls_size = tmp_tls_size;
}
if (ldso_fail) _exit(127);
if (ldd_mode) _exit(0);
/* Determine if malloc was interposed by a replacement implementation
* so that calloc and the memalign family can harden against the
* possibility of incomplete replacement. */
if (find_sym(head, "malloc", 1).dso != &ldso)
__malloc_replaced = 1;
if (find_sym(head, "aligned_alloc", 1).dso != &ldso)
__aligned_alloc_replaced = 1;
/* Switch to runtime mode: any further failures in the dynamic
* linker are a reportable failure rather than a fatal startup
* error. */
runtime = 1;
debug.ver = 1;
debug.bp = dl_debug_state;
debug.head = head;
debug.base = ldso.base;
debug.state = 0;
_dl_debug_state();
if (replace_argv0) argv[0] = replace_argv0;
add support for init/fini array in main program, and greatly simplify modern (4.7.x and later) gcc uses init/fini arrays, rather than the legacy _init/_fini function pasting and crtbegin/crtend ctors/dtors system, on most or all archs. some archs had already switched a long time ago. without following this change, global ctors/dtors will cease to work under musl when building with new gcc versions. the most surprising part of this patch is that it actually reduces the size of the init code, for both static and shared libc. this is achieved by (1) unifying the handling main program and shared libraries in the dynamic linker, and (2) eliminating the glibc-inspired rube goldberg machine for passing around init and fini function pointers. to clarify, some background: the function signature for __libc_start_main was based on glibc, as part of the original goal of being able to run some glibc-linked binaries. it worked by having the crt1 code, which is linked into every application, static or dynamic, obtain and pass pointers to the init and fini functions, which __libc_start_main is then responsible for using and recording for later use, as necessary. however, in neither the static-linked nor dynamic-linked case do we actually need crt1.o's help. with dynamic linking, all the pointers are available in the _DYNAMIC block. with static linking, it's safe to simply access the _init/_fini and __init_array_start, etc. symbols directly. obviously changing the __libc_start_main function signature in an incompatible way would break both old musl-linked programs and glibc-linked programs, so let's not do that. instead, the function can just ignore the information it doesn't need. new archs need not even provide the useless args in their versions of crt1.o. existing archs should continue to provide it as long as there is an interest in having newly-linked applications be able to run on old versions of musl; at some point in the future, this support can be removed.
2013-07-21 11:00:54 +04:00
errno = 0;
CRTJMP((void *)aux[AT_ENTRY], argv-1);
for(;;);
}
static void prepare_lazy(struct dso *p)
{
size_t dyn[DYN_CNT], n, flags1=0;
decode_vec(p->dynv, dyn, DYN_CNT);
search_vec(p->dynv, &flags1, DT_FLAGS_1);
if (dyn[DT_BIND_NOW] || (dyn[DT_FLAGS] & DF_BIND_NOW) || (flags1 & DF_1_NOW))
return;
n = dyn[DT_RELSZ]/2 + dyn[DT_RELASZ]/3 + dyn[DT_PLTRELSZ]/2 + 1;
if (NEED_MIPS_GOT_RELOCS) {
size_t j=0; search_vec(p->dynv, &j, DT_MIPS_GOTSYM);
size_t i=0; search_vec(p->dynv, &i, DT_MIPS_SYMTABNO);
n += i-j;
}
p->lazy = calloc(n, 3*sizeof(size_t));
if (!p->lazy) {
error("Error preparing lazy relocation for %s: %m", p->name);
longjmp(*rtld_fail, 1);
}
p->lazy_next = lazy_head;
lazy_head = p;
}
void *dlopen(const char *file, int mode)
{
struct dso *volatile p, *orig_tail, *orig_syms_tail, *orig_lazy_head, *next;
struct tls_module *orig_tls_tail;
size_t orig_tls_cnt, orig_tls_offset, orig_tls_align;
size_t i;
int cs;
jmp_buf jb;
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
struct dso **volatile ctor_queue = 0;
if (!file) return head;
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
pthread_rwlock_wrlock(&lock);
__inhibit_ptc();
p = 0;
if (shutting_down) {
error("Cannot dlopen while program is exiting.");
goto end;
}
orig_tls_tail = tls_tail;
orig_tls_cnt = tls_cnt;
orig_tls_offset = tls_offset;
orig_tls_align = tls_align;
orig_lazy_head = lazy_head;
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
orig_syms_tail = syms_tail;
orig_tail = tail;
noload = mode & RTLD_NOLOAD;
rtld_fail = &jb;
if (setjmp(*rtld_fail)) {
/* Clean up anything new that was (partially) loaded */
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
revert_syms(orig_syms_tail);
for (p=orig_tail->next; p; p=next) {
next = p->next;
while (p->td_index) {
void *tmp = p->td_index->next;
free(p->td_index);
p->td_index = tmp;
}
free(p->funcdescs);
if (p->rpath != p->rpath_orig)
free(p->rpath);
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
free(p->deps);
unmap_library(p);
free(p);
}
free(ctor_queue);
ctor_queue = 0;
if (!orig_tls_tail) libc.tls_head = 0;
tls_tail = orig_tls_tail;
if (tls_tail) tls_tail->next = 0;
tls_cnt = orig_tls_cnt;
tls_offset = orig_tls_offset;
tls_align = orig_tls_align;
lazy_head = orig_lazy_head;
tail = orig_tail;
tail->next = 0;
p = 0;
goto end;
} else p = load_library(file, head);
if (!p) {
error(noload ?
"Library %s is not already loaded" :
"Error loading shared library %s: %m",
file);
goto end;
}
/* First load handling */
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
load_deps(p);
extend_bfs_deps(p);
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
pthread_mutex_lock(&init_fini_lock);
if (!p->constructed) ctor_queue = queue_ctors(p);
pthread_mutex_unlock(&init_fini_lock);
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (!p->relocated && (mode & RTLD_LAZY)) {
prepare_lazy(p);
for (i=0; p->deps[i]; i++)
if (!p->deps[i]->relocated)
prepare_lazy(p->deps[i]);
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (!p->relocated || (mode & RTLD_GLOBAL)) {
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
/* Make new symbols global, at least temporarily, so we can do
* relocations. If not RTLD_GLOBAL, this is reverted below. */
add_syms(p);
for (i=0; p->deps[i]; i++)
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
add_syms(p->deps[i]);
}
fix and overhaul dlsym depedency order, always record direct deps dlsym with an explicit handle is specified to use "dependency order", a breadth-first search rooted at the argument. this has always been implemented by iterating a flattened dependency list built at dlopen time. however, the logic for building this list was completely wrong except in trivial cases; it simply used the list of libraries loaded since a given library, and their direct dependencies, as that library's dependencies, which could result in misordering, wrongful omission of deep dependencies from the search, and wrongful inclusion of unrelated libraries in the search. further, libraries did not have any recorded list of resolved dependencies until they were explicitly dlopened, meaning that DT_NEEDED entries had to be resolved again whenever a library participated as a dependency of more than one dlopened library. with this overhaul, the resolved direct dependency list of each library is always recorded when it is first loaded, and can be extended to a full flattened breadth-first search list if dlopen is called on the library. the extension is performed using the direct dependency list as a queue and appending copies of the direct dependency list of each dependency in the queue, excluding duplicates, until the end of the queue is reached. the direct deps remain available for future use as the initial subarray of the full deps array. first-load logic in dlopen is updated to match these changes, and clarified.
2019-02-27 02:05:19 +03:00
if (!p->relocated) {
reloc_all(p);
}
rework ldso handling of global symbol table for consistency when loading libraries with dlopen, the caller can request that the library's symbols become part of the global symbol table, or that they only be used for resolving relocations in the loaded library and its dependencies. in the latter case, a subsequent dlopen of the same library can upgrade it to global status. previously, if a library was upgraded from local to global mode, its symbols entered the symbol lookup search order at the point where the library was originally loaded. this means that a new call to dlopen could change the value of a symbol that already had a visible definition, an inconsistency which applications could observe. POSIX is unclear whether this should happen or whether it's permitted to happen, but the resolution of Austin Group issue #982 made it formally unspecified. with this patch, a library whose mode is upgraded from local to global enters the symbol lookup order at the point where it was made global, so that symbol resolution before and after the upgrade are consistent. in order to implement this change, the per-dso global flag is replaced with a separate set of linked-list pointers for participation in the global symbol table. this permits the order of dso objects for symbol resolution to differ from the order used for iteration of all loaded libraries. it also improves performance of find_sym, by avoiding a branch per iteration and skipping, and especially in the case where many non-global libraries have been loaded, by allowing the loop to skip over them entirely. logic for temporarily adding non-global libraries to the symbol table for relocation purposes is also mildly simplified.
2017-03-13 04:03:05 +03:00
/* If RTLD_GLOBAL was not specified, undo any new additions
* to the global symbol table. This is a nop if the library was
* previously loaded and already global. */
if (!(mode & RTLD_GLOBAL))
revert_syms(orig_syms_tail);
/* Processing of deferred lazy relocations must not happen until
* the new libraries are committed; otherwise we could end up with
* relocations resolved to symbol definitions that get removed. */
redo_lazy_relocs();
update_tls_size();
install dynamic tls synchronously at dlopen, streamline access previously, dynamic loading of new libraries with thread-local storage allocated the storage needed for all existing threads at load-time, precluding late failure that can't be handled, but left installation in existing threads to take place lazily on first access. this imposed an additional memory access and branch on every dynamic tls access, and imposed a requirement, which was not actually met, that the dynamic tlsdesc asm functions preserve all call-clobbered registers before calling C code to to install new dynamic tls on first access. the x86[_64] versions of this code wrongly omitted saving and restoring of fpu/vector registers, assuming the compiler would not generate anything using them in the called C code. the arm and aarch64 versions saved known existing registers, but failed to be future-proof against expansion of the register file. now that we track live threads in a list, it's possible to install the new dynamic tls for each thread at dlopen time. for the most part, synchronization is not needed, because if a thread has not synchronized with completion of the dlopen, there is no way it can meaningfully request access to a slot past the end of the old dtv, which remains valid for accessing slots which already existed. however, it is necessary to ensure that, if a thread sees its new dtv pointer, it sees correct pointers in each of the slots that existed prior to the dlopen. my understanding is that, on most real-world coherency architectures including all the ones we presently support, a built-in consume order guarantees this; however, don't rely on that. instead, the SYS_membarrier syscall is used to ensure that all threads see the stores to the slots of their new dtv prior to the installation of the new dtv. if it is not supported, the same is implemented in userspace via signals, using the same mechanism as __synccall. the __tls_get_addr function, variants, and dynamic tlsdesc asm functions are all updated to remove the fallback paths for claiming new dynamic tls, and are now all branch-free.
2019-02-18 07:22:27 +03:00
if (tls_cnt != orig_tls_cnt)
install_new_tls();
_dl_debug_state();
orig_tail = tail;
end:
__release_ptc();
if (p) gencnt++;
pthread_rwlock_unlock(&lock);
overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code.
2019-03-02 05:06:23 +03:00
if (ctor_queue) {
do_init_fini(ctor_queue);
free(ctor_queue);
}
pthread_setcancelstate(cs, 0);
return p;
}
hidden int __dl_invalid_handle(void *h)
{
struct dso *p;
for (p=head; p; p=p->next) if (h==p) return 0;
error("Invalid library handle %p", (void *)h);
return 1;
}
static void *addr2dso(size_t a)
{
struct dso *p;
size_t i;
if (DL_FDPIC) for (p=head; p; p=p->next) {
i = count_syms(p);
if (a-(size_t)p->funcdescs < i*sizeof(*p->funcdescs))
return p;
}
for (p=head; p; p=p->next) {
if (DL_FDPIC && p->loadmap) {
for (i=0; i<p->loadmap->nsegs; i++) {
if (a-p->loadmap->segs[i].p_vaddr
< p->loadmap->segs[i].p_memsz)
return p;
}
} else {
Phdr *ph = p->phdr;
size_t phcnt = p->phnum;
size_t entsz = p->phentsize;
size_t base = (size_t)p->base;
for (; phcnt--; ph=(void *)((char *)ph+entsz)) {
if (ph->p_type != PT_LOAD) continue;
if (a-base-ph->p_vaddr < ph->p_memsz)
return p;
}
if (a-(size_t)p->map < p->map_len)
return 0;
}
}
return 0;
}
static void *do_dlsym(struct dso *p, const char *s, void *ra)
{
int use_deps = 0;
if (p == head || p == RTLD_DEFAULT) {
p = head;
} else if (p == RTLD_NEXT) {
p = addr2dso((size_t)ra);
if (!p) p=head;
p = p->next;
} else if (__dl_invalid_handle(p)) {
return 0;
} else
use_deps = 1;
struct symdef def = find_sym2(p, s, 0, use_deps);
if (!def.sym) {
error("Symbol not found: %s", s);
return 0;
}
if ((def.sym->st_info&0xf) == STT_TLS)
return __tls_get_addr((tls_mod_off_t []){def.dso->tls_id, def.sym->st_value-DTP_OFFSET});
if (DL_FDPIC && (def.sym->st_info&0xf) == STT_FUNC)
return def.dso->funcdescs + (def.sym - def.dso->syms);
return laddr(def.dso, def.sym->st_value);
}
int dladdr(const void *addr_arg, Dl_info *info)
{
size_t addr = (size_t)addr_arg;
struct dso *p;
Sym *sym, *bestsym;
uint32_t nsym;
char *strings;
size_t best = 0;
size_t besterr = -1;
pthread_rwlock_rdlock(&lock);
p = addr2dso(addr);
pthread_rwlock_unlock(&lock);
if (!p) return 0;
sym = p->syms;
strings = p->strings;
nsym = count_syms(p);
if (DL_FDPIC) {
size_t idx = (addr-(size_t)p->funcdescs)
/ sizeof(*p->funcdescs);
if (idx < nsym && (sym[idx].st_info&0xf) == STT_FUNC) {
best = (size_t)(p->funcdescs + idx);
bestsym = sym + idx;
besterr = 0;
}
}
if (!best) for (; nsym; nsym--, sym++) {
if (sym->st_value
&& (1<<(sym->st_info&0xf) & OK_TYPES)
&& (1<<(sym->st_info>>4) & OK_BINDS)) {
size_t symaddr = (size_t)laddr(p, sym->st_value);
if (symaddr > addr || symaddr <= best)
continue;
best = symaddr;
bestsym = sym;
besterr = addr - symaddr;
if (addr == symaddr)
break;
}
}
if (best && besterr > bestsym->st_size-1) {
best = 0;
bestsym = 0;
}
info->dli_fname = p->name;
info->dli_fbase = p->map;
if (!best) {
info->dli_sname = 0;
info->dli_saddr = 0;
return 1;
}
if (DL_FDPIC && (bestsym->st_info&0xf) == STT_FUNC)
best = (size_t)(p->funcdescs + (bestsym - p->syms));
info->dli_sname = strings + bestsym->st_name;
info->dli_saddr = (void *)best;
return 1;
}
hidden void *__dlsym(void *restrict p, const char *restrict s, void *restrict ra)
{
void *res;
pthread_rwlock_rdlock(&lock);
res = do_dlsym(p, s, ra);
pthread_rwlock_unlock(&lock);
return res;
}
hidden void *__dlsym_redir_time64(void *restrict p, const char *restrict s, void *restrict ra)
{
#if _REDIR_TIME64
const char *suffix, *suffix2 = "";
char redir[36];
/* Map the symbol name to a time64 version of itself according to the
* pattern used for naming the redirected time64 symbols. */
size_t l = strnlen(s, sizeof redir);
if (l<4 || l==sizeof redir) goto no_redir;
if (s[l-2]=='_' && s[l-1]=='r') {
l -= 2;
suffix2 = s+l;
}
if (l<4) goto no_redir;
if (!strcmp(s+l-4, "time")) suffix = "64";
else suffix = "_time64";
/* Use the presence of the remapped symbol name in libc to determine
* whether it's one that requires time64 redirection; replace if so. */
snprintf(redir, sizeof redir, "__%.*s%s%s", (int)l, s, suffix, suffix2);
if (find_sym(&ldso, redir, 1).sym) s = redir;
no_redir:
#endif
return __dlsym(p, s, ra);
}
int dl_iterate_phdr(int(*callback)(struct dl_phdr_info *info, size_t size, void *data), void *data)
{
struct dso *current;
struct dl_phdr_info info;
int ret = 0;
for(current = head; current;) {
info.dlpi_addr = (uintptr_t)current->base;
info.dlpi_name = current->name;
info.dlpi_phdr = current->phdr;
info.dlpi_phnum = current->phnum;
info.dlpi_adds = gencnt;
info.dlpi_subs = 0;
info.dlpi_tls_modid = current->tls_id;
info.dlpi_tls_data = current->tls.image;
ret = (callback)(&info, sizeof (info), data);
if (ret != 0) break;
pthread_rwlock_rdlock(&lock);
current = current->next;
pthread_rwlock_unlock(&lock);
}
return ret;
}
static void error(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
if (!runtime) {
vdprintf(2, fmt, ap);
dprintf(2, "\n");
ldso_fail = 1;
va_end(ap);
return;
}
__dl_vseterr(fmt, ap);
va_end(ap);
}