711 files changed, 9383 insertions, 3117 deletions
diff --git a/COPYRIGHT b/COPYRIGHT
index e6472371..2f15edc7 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -74,6 +74,7 @@ Kylie McClain
 Leah Neukirchen
 Luca Barbato
 Luka Perkov
+Lynn Ochs
 M Farkas-Dyck (Strake)
 Mahesh Bodapati
 Markus Wichmann
@@ -103,7 +104,6 @@ Stefan O'Rear
 Szabolcs Nagy
 Timo Teräs
 Trutz Behn
-Valentin Ochs
 Will Dietz
 William Haddon
 William Pitcock
@@ -127,10 +127,13 @@ Copyright © 2017-2018 Arm Limited
 and labelled as such in comments in the individual source files. All
 have been licensed under extremely permissive terms.
 
-The ARM memcpy code (src/string/arm/memcpy_el.S) is Copyright © 2008
+The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
 The Android Open Source Project and is licensed under a two-clause BSD
 license. It was taken from Bionic libc, used on Android.
 
+The AArch64 memcpy and memset code (src/string/aarch64/*) are
+Copyright © 1999-2019, Arm Limited.
+
 The implementation of DES for crypt (src/crypt/crypt_des.c) is
 Copyright © 1994 David Burren. It is licensed under a BSD license.
 
@@ -140,7 +143,7 @@ domain. The code also comes with a fallback permissive license for use
 in jurisdictions that may not recognize the public domain.
 
 The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
-Valentin Ochs and is licensed under an MIT-style license.
+Lynn Ochs and is licensed under an MIT-style license.
 
 The x86_64 port was written by Nicholas J. Kain and is licensed under
 the standard MIT terms.
diff --git a/INSTALL b/INSTALL
index 22477b6b..ddbbbb2f 100644
--- a/INSTALL
+++ b/INSTALL
@@ -86,7 +86,7 @@ and ABI combinations:
 
 * SuperH (SH)
     * Standard ELF ABI or FDPIC ABI (shared-text without MMU)
-    * Little-endian by default; big-engian variant also supported
+    * Little-endian by default; big-endian variant also supported
     * Full FPU ABI or soft-float ABI is supported, but the
       single-precision-only FPU ABI is not
 
@@ -97,11 +97,16 @@ and ABI combinations:
 
 * OpenRISC 1000 (or1k)
 
-* RISC-V 64
+* RISC-V
+    * 32-bit and 64-bit
     * Little endian
     * Hard, soft, and hard-single/soft-double floating point ABIs
     * Standard ELF; no shared-text NOMMU support
 
+* LoongArch
+    * 64-bit ISA
+    * Hard, soft, and hard-single/soft-double floating point ABIs
+
 
 
 Build and Installation Procedure
diff --git a/Makefile b/Makefile
index bd8f5c38..3ad88b35 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,8 @@ includedir = $(prefix)/include
 libdir = $(prefix)/lib
 syslibdir = /lib
 
-SRC_DIRS = $(addprefix $(srcdir)/,src/* crt ldso $(COMPAT_SRC_DIRS))
+MALLOC_DIR = mallocng
+SRC_DIRS = $(addprefix $(srcdir)/,src/* src/malloc/$(MALLOC_DIR) crt ldso $(COMPAT_SRC_DIRS))
 BASE_GLOBS = $(addsuffix /*.c,$(SRC_DIRS))
 ARCH_GLOBS = $(addsuffix /$(ARCH)/*.[csS],$(SRC_DIRS))
 BASE_SRCS = $(sort $(wildcard $(BASE_GLOBS)))
@@ -108,7 +109,7 @@ obj/src/internal/version.o obj/src/internal/version.lo: obj/src/internal/version
 
 obj/crt/rcrt1.o obj/ldso/dlstart.lo obj/ldso/dynlink.lo: $(srcdir)/src/internal/dynlink.h $(srcdir)/arch/$(ARCH)/reloc.h
 
-obj/crt/crt1.o obj/crt/scrt1.o obj/crt/rcrt1.o obj/ldso/dlstart.lo: $(srcdir)/arch/$(ARCH)/crt_arch.h
+obj/crt/crt1.o obj/crt/Scrt1.o obj/crt/rcrt1.o obj/ldso/dlstart.lo: $(srcdir)/arch/$(ARCH)/crt_arch.h
 
 obj/crt/rcrt1.o: $(srcdir)/ldso/dlstart.c
 
diff --git a/VERSION b/VERSION
index 3fe3e58a..c813fe11 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.1.24
+1.2.5
diff --git a/WHATSNEW b/WHATSNEW
index f9879bd5..7bd90728 100644
--- a/WHATSNEW
+++ b/WHATSNEW
@@ -2164,3 +2164,277 @@ arch-specific bugs fixed:
 - riscv64 atomics had broken asm constraints (missing earlyclobber flag)
 - arm clone() was broken when compiled as thumb if start function returned
 - mipsr6 setjmp/longjmp did not preserve fpu register state correctly
+
+
+
+1.2.0 release notes
+
+new features:
+- time_t is now 64-bit on all archs (not just 64-bit archs)
+- character type & case mapping data updated to Unicode 12.1.0
+- header-level support for new linux features in 5.3 and 5.4
+
+performance:
+- new O(1) wchar_t case mapping implementation
+- i386 now uses C math code for exp, faster than old asm
+- mips math asm
+
+compatibility & conformance:
+- endian.h now aims to conform to future POSIX definition
+- support older compilers that don't accept powerpc math asm constraints
+- fdpic code in ldso was incompatible with valid optimizations in gcc 9+
+- RLIMIT_RTTIME was missing from sys/resource.h
+
+bugs fixed:
+- wcwidth wrongly returned 0 for most of planes 4 and up
+- missing case mapping between U+03F3 and U+037F
+- wrong cacosh results for arguments with negative imaginary part
+- wrong catanf/catanl results for various classes of arguments
+- wrong return value for ungetc with argument outside [0,UCHAR_MAX]
+- posix_openpt with no ptys available produced wrong errno
+
+arch-specific bugs fixed:
+- sigcontext/regset definition mistakes & omissions on m68k, powerpc64
+- fesetenv(FE_DFL_ENV) crashed on riscv64
+- sh2 dynamic linker was broken since 1.1.21 (crash in stage 2b)
+- arm dynamic linker chose wrong tls/atomic variants since 1.1.21
+- some math library functions returned excess precision on i386
+- unconfirmed regression in fchmodat AT_SYMLINK_NOFOLLOW on mips*
+
+
+
+1.2.1 release notes
+
+major changes:
+- new malloc implementation (mallocng & overhauled bump allocator)
+
+new features:
+- DNS queries via res_* now set AD flag, report zone signedness (DNSSEC)
+- PTHREAD_NULL macro (POSIX-future)
+
+performance:
+- optimized memcpy and memset for aarch64
+- optimized memcpy for arm now supports big endian
+- optimized x86_64 remquol
+- improved strerror without linear search
+
+bugs fixed:
+- lock-skipping for processes that returned to single-threaded was wrong
+- AF_UNSPEC dns lookups mishandled single failure in paired A+AAAA
+- res_send and res_query returned wrong value on errors from nameserver
+- corrupted sysvipc timestamps on 32-bit archs with old kernels
+- incorrect parsing of timezone offsets after overly-long zone name
+- clock_adjtime was broken on 32-bit archs (time64)
+- pthread_kill as not async-signal-safe
+- pthread_cancel was not async-cancel-safe
+- large-ulp errors in various math functions in non-default rounding modes
+
+arch-specific bugs fixed:
+- arm clock_gettime was broken on some hw due to bad time64 vdso
+- m68k sqrtl lacked long double precision
+- mips* syscall mechanism regressions on older kernels
+- mips* had negated error codes for some syscalls (kernel bug)
+- mips* SIGEMT was wrongly called SIGSTKFLT
+- sh fesetround didn't work correctly on sh
+
+
+
+1.2.2 release notes
+
+major changes:
+- child restrictions lifted after fork of multithreaded parent
+
+new features:
+- _Fork function (POSIX-future)
+- reallocarray function (extension from OpenBSD, now widespread)
+- gettid function (kernel tid as supported concept)
+- SIGEV_THREAD_ID sigevent API (Linux extension)
+- tcgetwinsize and tcsetwinsize functions (POSIX-future)
+
+performance:
+- faster software sqrt on archs without native sqrt instruction
+
+compatibility:
+- realpath no longer depends on procfs availability & accuracy
+- time zone parser now always prefers 64-bit tables if present
+- crypt_blowfish now supports $2b$ prefix
+- res_query now reports errors via h_errno
+- set*id and setrlimit are now safe in vforked/cloned child
+- setgroups now applies to all threads
+- dlopen debugger notification is improved, should work with lldb
+- setrlimit no longer needs __synccall broadcast on linux 2.6.36+
+- faccessat with AT_EACCESS no longer needs child process on linux 5.8+
+
+bugs fixed:
+- buffer overflow and infinite loop errors in wcsnrtombs (CVE-2020-28928)
+- sem_close unmapped still-referenced semaphores
+- fork of process with active aio could deadlock or crash paren
+- pthread_cond_wait was broken with priority-inheritance mutex
+- getgrouplist wrongly failed when nscd reported an empty list
+- abort could leak modified SIGABRT disposition to fork or posix_spawn child
+- regression with mallocng: malloc_usable_size(0) crashed
+- readlink wrongly gave EINVAL on zero length dest buffer
+- sqrtl was severely inaccurate (not correctly rounded) on ldquad archs
+- assert failure wrongly flushed stdio (possible deadlock)
+- MUSL_LOCPATH search was broken with multiple components
+- missing newline in herror output
+- possible deadlock in pthread_exit with pshared mutex or barrier usage
+- pthread_mutexattr_getprotocol didn't read back protocol
+- v4l2 ioctl translation for pre-time64 kernels didn't work
+
+arch-specific bugs fixed:
+- x86_64 longjmp failed to handle 0 argument reliably
+- i386 __set_thread_area fallback for pre-2.6 kernels didn't work
+- missing O_LARGEFILE macro value on x86_64, x32, mips64
+- unpredictable s390x breakage from failure to preserve call-saved registers
+
+
+
+1.2.3 release notes
+
+new features:
+- qsort_r function (POSIX-future)
+- pthread_getname_np extension function
+- hard float on SPE FPU for powerpc-sf
+- SEEK_DATA and SEEK_HOLE exposed in unistd.h (Linux extensions)
+
+compatibility:
+- free now preserves errno (POSIX-future requirement)
+- setjmp is declared explicitly with returns_twice for non-GCC compilers
+- macro version of isascii is no longer defined for C++
+- dynamic linker now tolerates zero-length LOAD segments
+- epoll_[p]wait is now a cancellation point
+- pwd/grp functions no longer fail on systems without AF_UNIX support
+- POSIX TZ parsing is stricter to allow more names to fallback to files
+- NULL is now defined as nullptr when used in C++11 or later
+- gettext now accepts null pointer as argument
+
+bugs fixed:
+- old regression in wcwidth of Hangul combining (vowel/final) letters
+- duplocale used wrong malloc when malloc was replaced (1.2.2 regression)
+- fmaf rounded wrong on archs without FE_TOWARDZERO (all softfloat archs)
+- popen didn't honor requirement not to leak other popen pipe fds to child
+- aligned_alloc and variants crashed on allocation failure
+- dl_iterate_phdr reported incorrect module TLS pointers
+- mishandling of some inputs in acoshf and expm1f and functions using them
+- potentially wrong-sign zero in cproj functions at infinity
+- multiple bugs in legacy function cuserid
+- minor posix_spawn file actions API conformance issues
+- pthread_setname_np fd leak
+- out-of-bound read in zoneinfo handling with distant-past times
+- out-of-tree builds lacked generated debug cfi for x86 asm
+
+arch-specific bugs fixed:
+- powerpc (32-bit) struct shmid_ds layout was wrong for some fields
+- time64 struct layout was wrong in sound ioctl fallback (32-bit archs)
+
+
+
+1.2.4 release notes
+
+new features:
+- large dns record lookups via tcp fallback
+- new getaddrinfo EAI_NODATA result to distinguish NODATA/NxDomain
+- support for new RELR compressed format for relative relocations
+- sysconf keys for querying signal stack size requirements
+- real vfork on riscv64
+
+performance:
+- mallocng no longer uses MADV_FREE (high performance cost, little gain)
+- vdso clock_gettime is supported once again on 32-bit arm
+
+compatibility:
+- gethostbyname family now distinguishes NO_DATA from HOST_NOT_FOUND
+- res_send now works with caller-provided edns0 queries
+- arpa/nameser.h RR types list is now up-to-date
+- previously-missing POSIX confstr keys have been added
+- mntent interfaces now accept missing fields
+- alt signal stack, if any, is now used for internal signals
+- the LFS64 macros are no longer exposed without _LARGEFILE64_SOURCE
+- memmem (POSIX-future) is now exposed in default feature profile
+- pthread_atfork now admits calls from an application-provided malloc
+- debugger tracking of shared libraries now works on MIPS PIE binaries
+- sendmsg now supports up to SCM_MAX_FD fds in SCM_RIGHTS messages
+
+bugs fixed:
+- gethostbyname[2]_r wrongly returned nonzero (error) on negative result
+- parallel v4/v6 address queries could fail on query id collisions
+- spurious getaddrinfo/AI_ADDRCONFIG failures due to errno clobbering
+- dns search domains ending in dot (including lone dot) broke lookups
+- ipv6 servers in resolv.conf broke lookups on systems with v6 disabled
+- systems with bindv6only failed to query both v4 and v6 nameservers
+- res_mkquery mishandled consecutive final dots in name
+- res_send could malfunction for very small answer buffer sizes
+- resolver dns backend accepted answers with wrong (A vs AAAA) RR type
+- getservbyport_r returned junk or ENOENT (vs ERANGE) on buffer size errors
+- dns result parsing of malformed responses could process uninitialized data
+- freopen didn't reset stream orientation (byte/wide) & encoding rule
+- fwprintf didn't print most fields on open_wmemstream FILEs
+- wide printf %lc ignored field width
+- wide printf erroneously processed %n after encoding errors
+- use of wide printf %9$ argument slot overflowed undersized buffer
+- swprintf malfunctioned on nul character in output
+- strverscmp ordered digit sequences vs nondigits incorrectly
+- timer_create/SIGEV_THREAD failure leaked the thread
+- semaphores were subject to missed-wake under certain usage patterns
+- several possible rare deadlocks with lock handling at thread exit
+- several possible rare deadlocks with aio and multithreaded fork
+- dynamic linker relro processing was broken on archs w/variable pagesize
+- async cancellation could run cancellation handlers in invalid context
+- pthread_detach was wrongly a cancellation point in rare race code path
+- use-after-close/double-close errors in mq_notify error paths
+- mq_notify event thread wrongly ran with signals unmasked
+- wcs{,n}cmp, wmemcmp returned wrong results when difference overflowed
+- accept4, pipe2, and dup3 handled unknown flags wrong in fallback cases
+- CPU_SETSIZE macro had wrong unit
+- select fallback for pre-time64 kernels truncated timeout (vs clamping)
+
+arch-specific bugs fixed:
+- x32 new socketcalls took fallback path due to pointer sign extension
+- x32 wait4 didn't fill rusage structure (time64 regression)
+- x32 semtimedop mismatched timespec ABI with kernel (time64 regression)
+- sigaction signal mask was bogus on or1k, microblaze, mips, and riscv
+- powerpc-sf longjmp asm clobbered value argument
+- or1k poll function passed timeout to syscall in wrong form
+
+
+
+1.2.5 release notes
+
+new features:
+- statx function (linux extension; via syscall and fallback using fstatat)
+- clone function is now usable and gives _Fork-like consistency in child
+- statvfs now provides f_type result
+- preadv2 and pwritev2 (linux extension) syscall wrappers
+- riscv64 TLSDESC support
+
+new ports:
+- loongarch64
+- riscv32
+
+compatibility:
+- DNS resolver can now handle answers with long CNAME chains
+- string.h no longer provides (C23-incompat) non-prototype decl of basename
+- fstatat statx backend now matches stat syscall non-automounting behavior
+- mntent interfaces now handle escaped whitespace in paths/options
+
+standards updates:
+- printf %lc of nul wchar now produces output
+- snprintf and swprintf no longer fail on n > INT_MAX
+- ppoll is now exposed in default feature profile
+
+bugs fixed:
+- some long DNS answers were wrongly rejected despite new TCP support
+- glob could wrongly return GLOB_NOMATCH if aborted before any matches
+- multithreaded set*id could malfunction from thread sequencing logic bug
+- certain use of threads after fork could deadlock thread-list lock
+- posix_spawn child could deadlock in race with async parent death
+- mbrtowc return value was wrong if argument n exceeded UINT_MAX
+- 80-bit extended acoshl and powl got some corner cases wrong
+- syslog incorrectly generated localized timestamps
+
+arch-specific bugs fixed:
+- arm (32-bit) TLSDESC malfunctioned due to addends being processed wrong
+- riscv64 icache flush operation was non-functional
+- sh sigsetjmp failed to properly restore call-saved register r8 on return
+- sh dlsym RTLD_NEXT did not identify calling module correctly
diff --git a/arch/aarch64/bits/hwcap.h b/arch/aarch64/bits/hwcap.h
index a7484028..424cc4d4 100644
--- a/arch/aarch64/bits/hwcap.h
+++ b/arch/aarch64/bits/hwcap.h
@@ -38,3 +38,15 @@
 #define HWCAP2_SVEBITPERM	(1 << 4)
 #define HWCAP2_SVESHA3		(1 << 5)
 #define HWCAP2_SVESM4		(1 << 6)
+#define HWCAP2_FLAGM2		(1 << 7)
+#define HWCAP2_FRINT		(1 << 8)
+#define HWCAP2_SVEI8MM		(1 << 9)
+#define HWCAP2_SVEF32MM		(1 << 10)
+#define HWCAP2_SVEF64MM		(1 << 11)
+#define HWCAP2_SVEBF16		(1 << 12)
+#define HWCAP2_I8MM		(1 << 13)
+#define HWCAP2_BF16		(1 << 14)
+#define HWCAP2_DGH		(1 << 15)
+#define HWCAP2_RNG		(1 << 16)
+#define HWCAP2_BTI		(1 << 17)
+#define HWCAP2_MTE		(1 << 18)
diff --git a/arch/aarch64/bits/mman.h b/arch/aarch64/bits/mman.h
new file mode 100644
index 00000000..8fad5ceb
--- /dev/null
+++ b/arch/aarch64/bits/mman.h
@@ -0,0 +1,2 @@
+#define PROT_BTI 0x10
+#define PROT_MTE 0x20
diff --git a/arch/aarch64/bits/posix.h b/arch/aarch64/bits/posix.h
deleted file mode 100644
index c37b94c1..00000000
--- a/arch/aarch64/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFF64  1
-#define _POSIX_V7_LP64_OFF64  1
diff --git a/arch/aarch64/bits/reg.h b/arch/aarch64/bits/reg.h
deleted file mode 100644
index 2633f39d..00000000
--- a/arch/aarch64/bits/reg.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
diff --git a/arch/aarch64/bits/signal.h b/arch/aarch64/bits/signal.h
index b71261f5..5098c734 100644
--- a/arch/aarch64/bits/signal.h
+++ b/arch/aarch64/bits/signal.h
@@ -11,7 +11,7 @@ typedef unsigned long greg_t;
 typedef unsigned long gregset_t[34];
 
 typedef struct {
-	long double vregs[32];
+	__uint128_t vregs[32];
 	unsigned int fpsr;
 	unsigned int fpcr;
 } fpregset_t;
@@ -34,7 +34,7 @@ struct fpsimd_context {
 	struct _aarch64_ctx head;
 	unsigned int fpsr;
 	unsigned int fpcr;
-	long double vregs[32];
+	__uint128_t vregs[32];
 };
 struct esr_context {
 	struct _aarch64_ctx head;
diff --git a/arch/aarch64/bits/stat.h b/arch/aarch64/bits/stat.h
deleted file mode 100644
index b7f4221b..00000000
--- a/arch/aarch64/bits/stat.h
+++ /dev/null
@@ -1,18 +0,0 @@
-struct stat {
-	dev_t st_dev;
-	ino_t st_ino;
-	mode_t st_mode;
-	nlink_t st_nlink;
-	uid_t st_uid;
-	gid_t st_gid;
-	dev_t st_rdev;
-	unsigned long __pad;
-	off_t st_size;
-	blksize_t st_blksize;
-	int __pad2;
-	blkcnt_t st_blocks;
-	struct timespec st_atim;
-	struct timespec st_mtim;
-	struct timespec st_ctim;
-	unsigned __unused[2];
-};
diff --git a/arch/aarch64/bits/stdint.h b/arch/aarch64/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/aarch64/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/aarch64/bits/syscall.h.in b/arch/aarch64/bits/syscall.h.in
index 93648afd..ea5a152a 100644
--- a/arch/aarch64/bits/syscall.h.in
+++ b/arch/aarch64/bits/syscall.h.in
@@ -289,4 +289,19 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/aarch64/bits/user.h b/arch/aarch64/bits/user.h
index d12cdf7f..8a1002aa 100644
--- a/arch/aarch64/bits/user.h
+++ b/arch/aarch64/bits/user.h
@@ -6,7 +6,7 @@ struct user_regs_struct {
 };
 
 struct user_fpsimd_struct {
-	long double vregs[32];
+	__uint128_t vregs[32];
 	unsigned int fpsr;
 	unsigned int fpcr;
 };
diff --git a/arch/aarch64/pthread_arch.h b/arch/aarch64/pthread_arch.h
index e64b126d..3909616c 100644
--- a/arch/aarch64/pthread_arch.h
+++ b/arch/aarch64/pthread_arch.h
@@ -1,12 +1,11 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *self;
-	__asm__ ("mrs %0,tpidr_el0" : "=r"(self));
-	return (void*)(self - sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ("mrs %0,tpidr_el0" : "=r"(tp));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 16
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC pc
diff --git a/arch/arm/bits/posix.h b/arch/arm/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/arm/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/arm/bits/reg.h b/arch/arm/bits/reg.h
deleted file mode 100644
index 0c7bffca..00000000
--- a/arch/arm/bits/reg.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-/* FIXME */
diff --git a/arch/arm/bits/stdint.h b/arch/arm/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/arm/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/arm/bits/syscall.h.in b/arch/arm/bits/syscall.h.in
index 11d67763..157b304d 100644
--- a/arch/arm/bits/syscall.h.in
+++ b/arch/arm/bits/syscall.h.in
@@ -389,6 +389,21 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
 #define __ARM_NR_breakpoint	0x0f0001
 #define __ARM_NR_cacheflush	0x0f0002
diff --git a/arch/arm/pthread_arch.h b/arch/arm/pthread_arch.h
index e689ea21..157e2eae 100644
--- a/arch/arm/pthread_arch.h
+++ b/arch/arm/pthread_arch.h
@@ -1,11 +1,11 @@
 #if ((__ARM_ARCH_6K__ || __ARM_ARCH_6KZ__ || __ARM_ARCH_6ZK__) && !__thumb__) \
  || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7
 
-static inline pthread_t __pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *p;
-	__asm__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(p) );
-	return (void *)(p-sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(tp) );
+	return tp;
 }
 
 #else
@@ -16,18 +16,17 @@ static inline pthread_t __pthread_self()
 #define BLX "blx"
 #endif
 
-static inline pthread_t __pthread_self()
+static inline uintptr_t __get_tp()
 {
 	extern hidden uintptr_t __a_gettp_ptr;
-	register uintptr_t p __asm__("r0");
-	__asm__ ( BLX " %1" : "=r"(p) : "r"(__a_gettp_ptr) : "cc", "lr" );
-	return (void *)(p-sizeof(struct pthread));
+	register uintptr_t tp __asm__("r0");
+	__asm__ ( BLX " %1" : "=r"(tp) : "r"(__a_gettp_ptr) : "cc", "lr" );
+	return tp;
 }
 
 #endif
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 8
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC arm_pc
diff --git a/arch/arm/reloc.h b/arch/arm/reloc.h
index d091d2ad..d98eb8af 100644
--- a/arch/arm/reloc.h
+++ b/arch/arm/reloc.h
@@ -26,7 +26,7 @@
 #define REL_TPOFF       R_ARM_TLS_TPOFF32
 #define REL_TLSDESC     R_ARM_TLS_DESC
 
-#define TLSDESC_BACKWARDS
+#define TLSDESC_BACKWARDS 1
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
 	"mov sp,%1 ; bx %0" : : "r"(pc), "r"(sp) : "memory" )
diff --git a/arch/arm/syscall_arch.h b/arch/arm/syscall_arch.h
index 4b08762d..624e992e 100644
--- a/arch/arm/syscall_arch.h
+++ b/arch/arm/syscall_arch.h
@@ -98,12 +98,13 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	__asm_syscall(R7_OPERAND, "0"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5));
 }
 
+#define SYSCALL_FADVISE_6_ARG
+
+#define SYSCALL_IPC_BROKEN_MODE
+
 #define VDSO_USEFUL
 #define VDSO_CGT32_SYM "__vdso_clock_gettime"
 #define VDSO_CGT32_VER "LINUX_2.6"
 #define VDSO_CGT_SYM "__vdso_clock_gettime64"
 #define VDSO_CGT_VER "LINUX_2.6"
-
-#define SYSCALL_FADVISE_6_ARG
-
-#define SYSCALL_IPC_BROKEN_MODE
+#define VDSO_CGT_WORKAROUND 1
diff --git a/arch/generic/bits/fcntl.h b/arch/generic/bits/fcntl.h
index ae233cc0..730a98cf 100644
--- a/arch/generic/bits/fcntl.h
+++ b/arch/generic/bits/fcntl.h
@@ -30,9 +30,15 @@
 #define F_SETSIG 10
 #define F_GETSIG 11
 
+#if __LONG_MAX == 0x7fffffffL
 #define F_GETLK 12
 #define F_SETLK 13
 #define F_SETLKW 14
+#else
+#define F_GETLK 5
+#define F_SETLK 6
+#define F_SETLKW 7
+#endif
 
 #define F_SETOWN_EX 15
 #define F_GETOWN_EX 16
diff --git a/arch/generic/bits/reg.h b/arch/generic/bits/reg.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/arch/generic/bits/reg.h
diff --git a/arch/riscv64/bits/stat.h b/arch/generic/bits/stat.h
index b7f4221b..f6d9e864 100644
--- a/arch/riscv64/bits/stat.h
+++ b/arch/generic/bits/stat.h
@@ -6,7 +6,7 @@ struct stat {
 	uid_t st_uid;
 	gid_t st_gid;
 	dev_t st_rdev;
-	unsigned long __pad;
+	unsigned long long __pad;
 	off_t st_size;
 	blksize_t st_blksize;
 	int __pad2;
diff --git a/arch/i386/bits/stdint.h b/arch/generic/bits/stdint.h
index d1b27121..86489187 100644
--- a/arch/i386/bits/stdint.h
+++ b/arch/generic/bits/stdint.h
@@ -12,9 +12,18 @@ typedef uint32_t uint_fast32_t;
 #define UINT_FAST16_MAX UINT32_MAX
 #define UINT_FAST32_MAX UINT32_MAX
 
+#if __LONG_MAX == 0x7fffffffL
 #define INTPTR_MIN      INT32_MIN
 #define INTPTR_MAX      INT32_MAX
 #define UINTPTR_MAX     UINT32_MAX
 #define PTRDIFF_MIN     INT32_MIN
 #define PTRDIFF_MAX     INT32_MAX
 #define SIZE_MAX        UINT32_MAX
+#else
+#define INTPTR_MIN      INT64_MIN
+#define INTPTR_MAX      INT64_MAX
+#define UINTPTR_MAX     UINT64_MAX
+#define PTRDIFF_MIN     INT64_MIN
+#define PTRDIFF_MAX     INT64_MAX
+#define SIZE_MAX        UINT64_MAX
+#endif
diff --git a/arch/i386/bits/posix.h b/arch/i386/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/i386/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/i386/bits/reg.h b/arch/i386/bits/reg.h
index 8bc2582d..7dfe8250 100644
--- a/arch/i386/bits/reg.h
+++ b/arch/i386/bits/reg.h
@@ -1,5 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
 #define EBX 0
 #define ECX 1
 #define EDX 2
diff --git a/arch/i386/bits/syscall.h.in b/arch/i386/bits/syscall.h.in
index 1ae4e48a..55e91cc4 100644
--- a/arch/i386/bits/syscall.h.in
+++ b/arch/i386/bits/syscall.h.in
@@ -426,4 +426,20 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_memfd_secret	447
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/i386/crt_arch.h b/arch/i386/crt_arch.h
index 43c8477a..1a80fce3 100644
--- a/arch/i386/crt_arch.h
+++ b/arch/i386/crt_arch.h
@@ -3,6 +3,7 @@ __asm__(
 ".weak _DYNAMIC \n"
 ".hidden _DYNAMIC \n"
 ".global " START "\n"
+".type " START ",%function \n"
 START ":\n"
 "	xor %ebp,%ebp \n"
 "	mov %esp,%eax \n"
diff --git a/arch/i386/pthread_arch.h b/arch/i386/pthread_arch.h
index 6f600b9e..a639c382 100644
--- a/arch/i386/pthread_arch.h
+++ b/arch/i386/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("movl %%gs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("movl %%gs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_EIP]
diff --git a/arch/i386/syscall_arch.h b/arch/i386/syscall_arch.h
index 69642e57..f92b7aa9 100644
--- a/arch/i386/syscall_arch.h
+++ b/arch/i386/syscall_arch.h
@@ -87,5 +87,3 @@ static inline long __syscall6(long n, long a1, long a2, long a3, long a4, long a
 #define VDSO_CGT32_VER "LINUX_2.6"
 #define VDSO_CGT_SYM "__vdso_clock_gettime64"
 #define VDSO_CGT_VER "LINUX_2.6"
-
-#define SYSCALL_USE_SOCKETCALL
diff --git a/arch/loongarch64/atomic_arch.h b/arch/loongarch64/atomic_arch.h
new file mode 100644
index 00000000..2225d027
--- /dev/null
+++ b/arch/loongarch64/atomic_arch.h
@@ -0,0 +1,53 @@
+#define a_ll a_ll
+static inline int a_ll(volatile int *p)
+{
+	int v;
+	__asm__ __volatile__ (
+		"ll.w %0, %1"
+		: "=r"(v)
+		: "ZC"(*p));
+	return v;
+}
+
+#define a_sc a_sc
+static inline int a_sc(volatile int *p, int v)
+{
+	int r;
+	__asm__ __volatile__ (
+		"sc.w %0, %1"
+		: "=r"(r), "=ZC"(*p)
+		: "0"(v) : "memory");
+	return r;
+}
+
+#define a_ll_p a_ll_p
+static inline void *a_ll_p(volatile void *p)
+{
+	void *v;
+	__asm__ __volatile__ (
+		"ll.d %0, %1"
+		: "=r"(v)
+		: "ZC"(*(void *volatile *)p));
+	return v;
+}
+
+#define a_sc_p a_sc_p
+static inline int a_sc_p(volatile void *p, void *v)
+{
+	long r;
+	__asm__ __volatile__ (
+		"sc.d %0, %1"
+		: "=r"(r), "=ZC"(*(void *volatile *)p)
+		: "0"(v)
+		: "memory");
+	return r;
+}
+
+#define a_barrier a_barrier
+static inline void a_barrier()
+{
+	__asm__ __volatile__ ("dbar 0" : : : "memory");
+}
+
+#define a_pre_llsc  a_barrier
+#define a_post_llsc a_barrier
diff --git a/arch/loongarch64/bits/alltypes.h.in b/arch/loongarch64/bits/alltypes.h.in
new file mode 100644
index 00000000..d1807aca
--- /dev/null
+++ b/arch/loongarch64/bits/alltypes.h.in
@@ -0,0 +1,18 @@
+#define _Addr long
+#define _Int64 long
+#define _Reg long
+
+#define __BYTE_ORDER 1234
+#define __LONG_MAX 0x7fffffffffffffffL
+
+#ifndef __cplusplus
+TYPEDEF int wchar_t;
+#endif
+
+TYPEDEF float float_t;
+TYPEDEF double double_t;
+
+TYPEDEF struct { long long __ll; long double __ld; } max_align_t;
+
+TYPEDEF unsigned nlink_t;
+TYPEDEF int blksize_t;
diff --git a/arch/loongarch64/bits/fenv.h b/arch/loongarch64/bits/fenv.h
new file mode 100644
index 00000000..264cafb5
--- /dev/null
+++ b/arch/loongarch64/bits/fenv.h
@@ -0,0 +1,20 @@
+#define FE_INEXACT    0x010000
+#define FE_UNDERFLOW  0x020000
+#define FE_OVERFLOW   0x040000
+#define FE_DIVBYZERO  0x080000
+#define FE_INVALID    0x100000
+
+#define FE_ALL_EXCEPT 0x1F0000
+
+#define FE_TONEAREST  0x000
+#define FE_TOWARDZERO 0x100
+#define FE_UPWARD     0x200
+#define FE_DOWNWARD   0x300
+
+typedef unsigned fexcept_t;
+
+typedef struct {
+	unsigned __cw;
+} fenv_t;
+
+#define FE_DFL_ENV ((const fenv_t *) -1)
diff --git a/arch/loongarch64/bits/float.h b/arch/loongarch64/bits/float.h
new file mode 100644
index 00000000..719c7908
--- /dev/null
+++ b/arch/loongarch64/bits/float.h
@@ -0,0 +1,16 @@
+#define FLT_EVAL_METHOD 0
+
+#define LDBL_TRUE_MIN 6.47517511943802511092443895822764655e-4966L
+#define LDBL_MIN 3.36210314311209350626267781732175260e-4932L
+#define LDBL_MAX 1.18973149535723176508575932662800702e+4932L
+#define LDBL_EPSILON 1.92592994438723585305597794258492732e-34L
+
+#define LDBL_MANT_DIG 113
+#define LDBL_MIN_EXP (-16381)
+#define LDBL_MAX_EXP 16384
+
+#define LDBL_DIG 33
+#define LDBL_MIN_10_EXP (-4931)
+#define LDBL_MAX_10_EXP 4932
+
+#define DECIMAL_DIG 36
diff --git a/arch/loongarch64/bits/hwcap.h b/arch/loongarch64/bits/hwcap.h
new file mode 100644
index 00000000..b8184f69
--- /dev/null
+++ b/arch/loongarch64/bits/hwcap.h
@@ -0,0 +1,14 @@
+#define HWCAP_LOONGARCH_CPUCFG          (1 << 0)
+#define HWCAP_LOONGARCH_LAM             (1 << 1)
+#define HWCAP_LOONGARCH_UAL             (1 << 2)
+#define HWCAP_LOONGARCH_FPU             (1 << 3)
+#define HWCAP_LOONGARCH_LSX             (1 << 4)
+#define HWCAP_LOONGARCH_LASX            (1 << 5)
+#define HWCAP_LOONGARCH_CRC32           (1 << 6)
+#define HWCAP_LOONGARCH_COMPLEX         (1 << 7)
+#define HWCAP_LOONGARCH_CRYPTO          (1 << 8)
+#define HWCAP_LOONGARCH_LVZ             (1 << 9)
+#define HWCAP_LOONGARCH_LBT_X86         (1 << 10)
+#define HWCAP_LOONGARCH_LBT_ARM         (1 << 11)
+#define HWCAP_LOONGARCH_LBT_MIPS        (1 << 12)
+#define HWCAP_LOONGARCH_PTW             (1 << 13)
diff --git a/arch/loongarch64/bits/setjmp.h b/arch/loongarch64/bits/setjmp.h
new file mode 100644
index 00000000..3b15e87b
--- /dev/null
+++ b/arch/loongarch64/bits/setjmp.h
@@ -0,0 +1 @@
+typedef unsigned long __jmp_buf[23];
diff --git a/arch/loongarch64/bits/signal.h b/arch/loongarch64/bits/signal.h
new file mode 100644
index 00000000..5a9ed8c9
--- /dev/null
+++ b/arch/loongarch64/bits/signal.h
@@ -0,0 +1,101 @@
+#if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
+ || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+
+#if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+#define MINSIGSTKSZ 4096
+#define SIGSTKSZ 16384
+#endif
+
+#if defined(_GNU_SOURCE)
+#define LARCH_NGREG 32
+#define LARCH_REG_RA 1
+#define LARCH_REG_SP 3
+#define LARCH_REG_S0 23
+#define LARCH_REG_S1 24
+#define LARCH_REG_A0 4
+#define LARCH_REG_S2 25
+#define LARCH_REG_NARGS 8
+#endif
+
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+typedef unsigned long greg_t, gregset_t[32];
+
+struct sigcontext {
+	unsigned long sc_pc;
+	unsigned long sc_regs[32];
+	unsigned sc_flags;
+	unsigned long sc_extcontext[] __attribute__((__aligned__(16)));
+};
+#endif
+
+typedef struct {
+	unsigned long __pc;
+	unsigned long __gregs[32];
+	unsigned __flags;
+	unsigned long __extcontext[] __attribute__((__aligned__(16)));
+} mcontext_t;
+
+struct sigaltstack {
+	void *ss_sp;
+	int ss_flags;
+	size_t ss_size;
+};
+
+typedef struct __ucontext
+{
+	unsigned long uc_flags;
+	struct __ucontext *uc_link;
+	stack_t uc_stack;
+	sigset_t uc_sigmask;
+	long __uc_pad;
+	mcontext_t uc_mcontext;
+} ucontext_t;
+
+#define __uc_flags uc_flags
+
+#define SA_NOCLDSTOP 1
+#define SA_NOCLDWAIT 2
+#define SA_SIGINFO   4
+#define SA_ONSTACK   0x08000000
+#define SA_RESTART   0x10000000
+#define SA_NODEFER   0x40000000
+#define SA_RESETHAND 0x80000000
+
+#endif
+
+#define SIGHUP     1
+#define SIGINT     2
+#define SIGQUIT    3
+#define SIGILL     4
+#define SIGTRAP    5
+#define SIGABRT    6
+#define SIGIOT     SIGABRT
+#define SIGBUS     7
+#define SIGFPE     8
+#define SIGKILL    9
+#define SIGUSR1   10
+#define SIGSEGV   11
+#define SIGUSR2   12
+#define SIGPIPE   13
+#define SIGALRM   14
+#define SIGTERM   15
+#define SIGSTKFLT 16
+#define SIGCHLD   17
+#define SIGCONT   18
+#define SIGSTOP   19
+#define SIGTSTP   20
+#define SIGTTIN   21
+#define SIGTTOU   22
+#define SIGURG    23
+#define SIGXCPU   24
+#define SIGXFSZ   25
+#define SIGVTALRM 26
+#define SIGPROF   27
+#define SIGWINCH  28
+#define SIGIO     29
+#define SIGPOLL   SIGIO
+#define SIGPWR    30
+#define SIGSYS    31
+#define SIGUNUSED SIGSYS
+
+#define _NSIG 65
diff --git a/arch/loongarch64/bits/syscall.h.in b/arch/loongarch64/bits/syscall.h.in
new file mode 100644
index 00000000..2afb4ea1
--- /dev/null
+++ b/arch/loongarch64/bits/syscall.h.in
@@ -0,0 +1,316 @@
+#define __NR_io_setup                   0
+#define __NR_io_destroy                 1
+#define __NR_io_submit                  2
+#define __NR_io_cancel                  3
+#define __NR_io_getevents               4
+#define __NR_setxattr                   5
+#define __NR_lsetxattr                  6
+#define __NR_fsetxattr                  7
+#define __NR_getxattr                   8
+#define __NR_lgetxattr                  9
+#define __NR_fgetxattr                  10
+#define __NR_listxattr                  11
+#define __NR_llistxattr                 12
+#define __NR_flistxattr                 13
+#define __NR_removexattr                14
+#define __NR_lremovexattr               15
+#define __NR_fremovexattr               16
+#define __NR_getcwd                     17
+#define __NR_lookup_dcookie             18
+#define __NR_eventfd2                   19
+#define __NR_epoll_create1              20
+#define __NR_epoll_ctl                  21
+#define __NR_epoll_pwait                22
+#define __NR_dup                        23
+#define __NR_dup3                       24
+#define __NR3264_fcntl                  25
+#define __NR_inotify_init1              26
+#define __NR_inotify_add_watch          27
+#define __NR_inotify_rm_watch           28
+#define __NR_ioctl                      29
+#define __NR_ioprio_set                 30
+#define __NR_ioprio_get                 31
+#define __NR_flock                      32
+#define __NR_mknodat                    33
+#define __NR_mkdirat                    34
+#define __NR_unlinkat                   35
+#define __NR_symlinkat                  36
+#define __NR_linkat                     37
+#define __NR_umount2                    39
+#define __NR_mount                      40
+#define __NR_pivot_root                 41
+#define __NR_nfsservctl                 42
+#define __NR3264_statfs                 43
+#define __NR3264_fstatfs                44
+#define __NR3264_truncate               45
+#define __NR3264_ftruncate              46
+#define __NR_fallocate                  47
+#define __NR_faccessat                  48
+#define __NR_chdir                      49
+#define __NR_fchdir                     50
+#define __NR_chroot                     51
+#define __NR_fchmod                     52
+#define __NR_fchmodat                   53
+#define __NR_fchownat                   54
+#define __NR_fchown                     55
+#define __NR_openat                     56
+#define __NR_close                      57
+#define __NR_vhangup                    58
+#define __NR_pipe2                      59
+#define __NR_quotactl                   60
+#define __NR_getdents64                 61
+#define __NR3264_lseek                  62
+#define __NR_read                       63
+#define __NR_write                      64
+#define __NR_readv                      65
+#define __NR_writev                     66
+#define __NR_pread64                    67
+#define __NR_pwrite64                   68
+#define __NR_preadv                     69
+#define __NR_pwritev                    70
+#define __NR3264_sendfile               71
+#define __NR_pselect6                   72
+#define __NR_ppoll                      73
+#define __NR_signalfd4                  74
+#define __NR_vmsplice                   75
+#define __NR_splice                     76
+#define __NR_tee                        77
+#define __NR_readlinkat                 78
+#define __NR_sync                       81
+#define __NR_fsync                      82
+#define __NR_fdatasync                  83
+#define __NR_sync_file_range            84
+#define __NR_timerfd_create             85
+#define __NR_timerfd_settime            86
+#define __NR_timerfd_gettime            87
+#define __NR_utimensat                  88
+#define __NR_acct                       89
+#define __NR_capget                     90
+#define __NR_capset                     91
+#define __NR_personality                92
+#define __NR_exit                       93
+#define __NR_exit_group                 94
+#define __NR_waitid                     95
+#define __NR_set_tid_address            96
+#define __NR_unshare                    97
+#define __NR_futex                      98
+#define __NR_set_robust_list            99
+#define __NR_get_robust_list            100
+#define __NR_nanosleep                  101
+#define __NR_getitimer                  102
+#define __NR_setitimer                  103
+#define __NR_kexec_load                 104
+#define __NR_init_module                105
+#define __NR_delete_module              106
+#define __NR_timer_create               107
+#define __NR_timer_gettime              108
+#define __NR_timer_getoverrun           109
+#define __NR_timer_settime              110
+#define __NR_timer_delete               111
+#define __NR_clock_settime              112
+#define __NR_clock_gettime              113
+#define __NR_clock_getres               114
+#define __NR_clock_nanosleep            115
+#define __NR_syslog                     116
+#define __NR_ptrace                     117
+#define __NR_sched_setparam             118
+#define __NR_sched_setscheduler         119
+#define __NR_sched_getscheduler         120
+#define __NR_sched_getparam             121
+#define __NR_sched_setaffinity          122
+#define __NR_sched_getaffinity          123
+#define __NR_sched_yield                124
+#define __NR_sched_get_priority_max     125
+#define __NR_sched_get_priority_min     126
+#define __NR_sched_rr_get_interval      127
+#define __NR_restart_syscall            128
+#define __NR_kill                       129
+#define __NR_tkill                      130
+#define __NR_tgkill                     131
+#define __NR_sigaltstack                132
+#define __NR_rt_sigsuspend              133
+#define __NR_rt_sigaction               134
+#define __NR_rt_sigprocmask             135
+#define __NR_rt_sigpending              136
+#define __NR_rt_sigtimedwait            137
+#define __NR_rt_sigqueueinfo            138
+#define __NR_rt_sigreturn               139
+#define __NR_setpriority                140
+#define __NR_getpriority                141
+#define __NR_reboot                     142
+#define __NR_setregid                   143
+#define __NR_setgid                     144
+#define __NR_setreuid                   145
+#define __NR_setuid                     146
+#define __NR_setresuid                  147
+#define __NR_getresuid                  148
+#define __NR_setresgid                  149
+#define __NR_getresgid                  150
+#define __NR_setfsuid                   151
+#define __NR_setfsgid                   152
+#define __NR_times                      153
+#define __NR_setpgid                    154
+#define __NR_getpgid                    155
+#define __NR_getsid                     156
+#define __NR_setsid                     157
+#define __NR_getgroups                  158
+#define __NR_setgroups                  159
+#define __NR_uname                      160
+#define __NR_sethostname                161
+#define __NR_setdomainname              162
+#define __NR_getrusage                  165
+#define __NR_umask                      166
+#define __NR_prctl                      167
+#define __NR_getcpu                     168
+#define __NR_gettimeofday               169
+#define __NR_settimeofday               170
+#define __NR_adjtimex                   171
+#define __NR_getpid                     172
+#define __NR_getppid                    173
+#define __NR_getuid                     174
+#define __NR_geteuid                    175
+#define __NR_getgid                     176
+#define __NR_getegid                    177
+#define __NR_gettid                     178
+#define __NR_sysinfo                    179
+#define __NR_mq_open                    180
+#define __NR_mq_unlink                  181
+#define __NR_mq_timedsend               182
+#define __NR_mq_timedreceive            183
+#define __NR_mq_notify                  184
+#define __NR_mq_getsetattr              185
+#define __NR_msgget                     186
+#define __NR_msgctl                     187
+#define __NR_msgrcv                     188
+#define __NR_msgsnd                     189
+#define __NR_semget                     190
+#define __NR_semctl                     191
+#define __NR_semtimedop                 192
+#define __NR_semop                      193
+#define __NR_shmget                     194
+#define __NR_shmctl                     195
+#define __NR_shmat                      196
+#define __NR_shmdt                      197
+#define __NR_socket                     198
+#define __NR_socketpair                 199
+#define __NR_bind                       200
+#define __NR_listen                     201
+#define __NR_accept                     202
+#define __NR_connect                    203
+#define __NR_getsockname                204
+#define __NR_getpeername                205
+#define __NR_sendto                     206
+#define __NR_recvfrom                   207
+#define __NR_setsockopt                 208
+#define __NR_getsockopt                 209
+#define __NR_shutdown                   210
+#define __NR_sendmsg                    211
+#define __NR_recvmsg                    212
+#define __NR_readahead                  213
+#define __NR_brk                        214
+#define __NR_munmap                     215
+#define __NR_mremap                     216
+#define __NR_add_key                    217
+#define __NR_request_key                218
+#define __NR_keyctl                     219
+#define __NR_clone                      220
+#define __NR_execve                     221
+#define __NR3264_mmap                   222
+#define __NR3264_fadvise64              223
+#define __NR_swapon                     224
+#define __NR_swapoff                    225
+#define __NR_mprotect                   226
+#define __NR_msync                      227
+#define __NR_mlock                      228
+#define __NR_munlock                    229
+#define __NR_mlockall                   230
+#define __NR_munlockall                 231
+#define __NR_mincore                    232
+#define __NR_madvise                    233
+#define __NR_remap_file_pages           234
+#define __NR_mbind                      235
+#define __NR_get_mempolicy              236
+#define __NR_set_mempolicy              237
+#define __NR_migrate_pages              238
+#define __NR_move_pages                 239
+#define __NR_rt_tgsigqueueinfo          240
+#define __NR_perf_event_open            241
+#define __NR_accept4                    242
+#define __NR_recvmmsg                   243
+#define __NR_arch_specific_syscall      244
+#define __NR_wait4                      260
+#define __NR_prlimit64                  261
+#define __NR_fanotify_init              262
+#define __NR_fanotify_mark              263
+#define __NR_name_to_handle_at          264
+#define __NR_open_by_handle_at          265
+#define __NR_clock_adjtime              266
+#define __NR_syncfs                     267
+#define __NR_setns                      268
+#define __NR_sendmmsg                   269
+#define __NR_process_vm_readv           270
+#define __NR_process_vm_writev          271
+#define __NR_kcmp                       272
+#define __NR_finit_module               273
+#define __NR_sched_setattr              274
+#define __NR_sched_getattr              275
+#define __NR_renameat2                  276
+#define __NR_seccomp                    277
+#define __NR_getrandom                  278
+#define __NR_memfd_create               279
+#define __NR_bpf                        280
+#define __NR_execveat                   281
+#define __NR_userfaultfd                282
+#define __NR_membarrier                 283
+#define __NR_mlock2                     284
+#define __NR_copy_file_range            285
+#define __NR_preadv2                    286
+#define __NR_pwritev2                   287
+#define __NR_pkey_mprotect              288
+#define __NR_pkey_alloc                 289
+#define __NR_pkey_free                  290
+#define __NR_statx                      291
+#define __NR_io_pgetevents              292
+#define __NR_rseq                       293
+#define __NR_kexec_file_load            294
+#define __NR_pidfd_send_signal          424
+#define __NR_io_uring_setup             425
+#define __NR_io_uring_enter             426
+#define __NR_io_uring_register          427
+#define __NR_open_tree                  428
+#define __NR_move_mount                 429
+#define __NR_fsopen                     430
+#define __NR_fsconfig                   431
+#define __NR_fsmount                    432
+#define __NR_fspick                     433
+#define __NR_pidfd_open                 434
+#define __NR_clone3                     435
+#define __NR_close_range                436
+#define __NR_openat2                    437
+#define __NR_pidfd_getfd                438
+#define __NR_faccessat2                 439
+#define __NR_process_madvise            440
+#define __NR_epoll_pwait2               441
+#define __NR_mount_setattr              442
+#define __NR_quotactl_fd                443
+#define __NR_landlock_create_ruleset    444
+#define __NR_landlock_add_rule          445
+#define __NR_landlock_restrict_self     446
+#define __NR_process_mrelease           448
+#define __NR_futex_waitv                449
+#define __NR_set_mempolicy_home_node    450
+#define __NR_cachestat                  451
+#define __NR_fchmodat2                  452
+#define __NR_map_shadow_stack           453
+#define __NR_futex_wake                 454
+#define __NR_futex_wait                 455
+#define __NR_futex_requeue              456
+#define __NR_fcntl                      __NR3264_fcntl
+#define __NR_statfs                     __NR3264_statfs
+#define __NR_fstatfs                    __NR3264_fstatfs
+#define __NR_truncate                   __NR3264_truncate
+#define __NR_ftruncate                  __NR3264_ftruncate
+#define __NR_lseek                      __NR3264_lseek
+#define __NR_sendfile                   __NR3264_sendfile
+#define __NR_mmap                       __NR3264_mmap
+#define __NR_fadvise64                  __NR3264_fadvise64
diff --git a/arch/loongarch64/bits/user.h b/arch/loongarch64/bits/user.h
new file mode 100644
index 00000000..fd9b7b22
--- /dev/null
+++ b/arch/loongarch64/bits/user.h
@@ -0,0 +1,24 @@
+#define ELF_NGREG    45
+#define ELF_NFPREG   34
+
+struct user_regs_struct {
+	unsigned long regs[32];
+	unsigned long orig_a0;
+	unsigned long csr_era;
+	unsigned long csr_badv;
+	unsigned long reserved[10];
+};
+
+struct user_fp_struct {
+	unsigned long fpr[32];
+	unsigned long fcc;
+	unsigned int fcsr;
+};
+
+typedef unsigned long elf_greg_t, elf_gregset_t[ELF_NGREG];
+
+typedef union {
+	double d;
+	float f;
+} elf_fpreg_t;
+typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
diff --git a/arch/loongarch64/crt_arch.h b/arch/loongarch64/crt_arch.h
new file mode 100644
index 00000000..e0760d9e
--- /dev/null
+++ b/arch/loongarch64/crt_arch.h
@@ -0,0 +1,13 @@
+__asm__(
+".text \n"
+".global " START "\n"
+".type   " START ", @function\n"
+START ":\n"
+"	move $fp, $zero\n"
+"	move $a0, $sp\n"
+".weak _DYNAMIC\n"
+".hidden _DYNAMIC\n"
+"	la.local $a1, _DYNAMIC\n"
+"	bstrins.d $sp, $zero, 3, 0\n"
+"	b " START "_c\n"
+);
diff --git a/arch/loongarch64/pthread_arch.h b/arch/loongarch64/pthread_arch.h
new file mode 100644
index 00000000..365f6ca8
--- /dev/null
+++ b/arch/loongarch64/pthread_arch.h
@@ -0,0 +1,11 @@
+static inline uintptr_t __get_tp()
+{
+	register uintptr_t tp __asm__("tp");
+	__asm__ ("" : "=r" (tp) );
+	return tp;
+}
+
+#define TLS_ABOVE_TP
+#define GAP_ABOVE_TP   0
+#define DTP_OFFSET     0
+#define MC_PC          __pc
diff --git a/arch/loongarch64/reloc.h b/arch/loongarch64/reloc.h
new file mode 100644
index 00000000..a4db6a9c
--- /dev/null
+++ b/arch/loongarch64/reloc.h
@@ -0,0 +1,30 @@
+#ifdef __loongarch_soft_float
+#define FP_SUFFIX "-sf"
+#elif defined __loongarch_single_float
+#define FP_SUFFIX "-sp"
+#else
+#define FP_SUFFIX ""
+#endif
+
+#define LDSO_ARCH "loongarch64" FP_SUFFIX
+
+#define TPOFF_K         0
+
+#define REL_PLT         R_LARCH_JUMP_SLOT
+#define REL_COPY        R_LARCH_COPY
+#define REL_DTPMOD      R_LARCH_TLS_DTPMOD64
+#define REL_DTPOFF      R_LARCH_TLS_DTPREL64
+#define REL_TPOFF       R_LARCH_TLS_TPREL64
+#define REL_RELATIVE    R_LARCH_RELATIVE
+#define REL_SYMBOLIC    R_LARCH_64
+#define REL_TLSDESC     R_LARCH_TLS_DESC64
+
+#define CRTJMP(pc,sp) __asm__ __volatile__( \
+	"move $sp, %1 ; jr %0" : : "r"(pc), "r"(sp) : "memory" )
+
+#define GETFUNCSYM(fp, sym, got) __asm__ ( \
+	".hidden " #sym "\n" \
+	".align 8 \n" \
+	"	la.local $t1, "#sym" \n" \
+	"	move %0, $t1 \n" \
+	: "=r"(*(fp)) : : "memory" )
diff --git a/arch/loongarch64/syscall_arch.h b/arch/loongarch64/syscall_arch.h
new file mode 100644
index 00000000..4d5e1885
--- /dev/null
+++ b/arch/loongarch64/syscall_arch.h
@@ -0,0 +1,137 @@
+#define __SYSCALL_LL_E(x) (x)
+#define __SYSCALL_LL_O(x) (x)
+
+#define SYSCALL_CLOBBERLIST \
+	"$t0", "$t1", "$t2", "$t3", \
+	"$t4", "$t5", "$t6", "$t7", "$t8", "memory"
+
+static inline long __syscall0(long n)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0");
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "=r"(a0)
+		: "r"(a7)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall1(long n, long a)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+		: "r"(a7)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall2(long n, long a, long b)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall3(long n, long a, long b, long c)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+	register long a2 __asm__("$a2") = c;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1), "r"(a2)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall4(long n, long a, long b, long c, long d)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+	register long a2 __asm__("$a2") = c;
+	register long a3 __asm__("$a3") = d;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1), "r"(a2), "r"(a3)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall5(long n, long a, long b, long c, long d, long e)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+	register long a2 __asm__("$a2") = c;
+	register long a3 __asm__("$a3") = d;
+	register long a4 __asm__("$a4") = e;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1), "r"(a2), "r"(a3), "r"(a4)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+	register long a2 __asm__("$a2") = c;
+	register long a3 __asm__("$a3") = d;
+	register long a4 __asm__("$a4") = e;
+	register long a5 __asm__("$a5") = f;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+static inline long __syscall7(long n, long a, long b, long c, long d, long e, long f, long g)
+{
+	register long a7 __asm__("$a7") = n;
+	register long a0 __asm__("$a0") = a;
+	register long a1 __asm__("$a1") = b;
+	register long a2 __asm__("$a2") = c;
+	register long a3 __asm__("$a3") = d;
+	register long a4 __asm__("$a4") = e;
+	register long a5 __asm__("$a5") = f;
+	register long a6 __asm__("$a6") = g;
+
+	__asm__ __volatile__ (
+		"syscall 0"
+		: "+r"(a0)
+	        : "r"(a7), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5), "r"(a6)
+		: SYSCALL_CLOBBERLIST);
+	return a0;
+}
+
+#define VDSO_USEFUL
+#define VDSO_CGT_SYM "__vdso_clock_gettime"
+#define VDSO_CGT_VER "LINUX_5.10"
+
+#define IPC_64  0
diff --git a/arch/m68k/bits/poll.h b/arch/m68k/bits/poll.h
new file mode 100644
index 00000000..00063f41
--- /dev/null
+++ b/arch/m68k/bits/poll.h
@@ -0,0 +1,2 @@
+#define POLLWRNORM POLLOUT
+#define POLLWRBAND 256
diff --git a/arch/m68k/bits/posix.h b/arch/m68k/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/m68k/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/m68k/bits/reg.h b/arch/m68k/bits/reg.h
index 99201f70..fedc4f9f 100644
--- a/arch/m68k/bits/reg.h
+++ b/arch/m68k/bits/reg.h
@@ -1,5 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
 #define PT_D1 0
 #define PT_D2 1
 #define PT_D3 2
diff --git a/arch/m68k/bits/stdint.h b/arch/m68k/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/m68k/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/m68k/bits/syscall.h.in b/arch/m68k/bits/syscall.h.in
index ddfa72e4..5cd84602 100644
--- a/arch/m68k/bits/syscall.h.in
+++ b/arch/m68k/bits/syscall.h.in
@@ -405,3 +405,19 @@
 #define __NR_fsmount		432
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
+#define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
diff --git a/arch/m68k/pthread_arch.h b/arch/m68k/pthread_arch.h
index 02d5b8a0..5bea4e1a 100644
--- a/arch/m68k/pthread_arch.h
+++ b/arch/m68k/pthread_arch.h
@@ -1,13 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	uintptr_t tp = __syscall(SYS_get_thread_area);
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return __syscall(SYS_get_thread_area);
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC gregs[R_PC]
diff --git a/arch/m68k/syscall_arch.h b/arch/m68k/syscall_arch.h
index af79c306..6a9d0ae8 100644
--- a/arch/m68k/syscall_arch.h
+++ b/arch/m68k/syscall_arch.h
@@ -87,5 +87,4 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	return d0;
 }
 
-#define SYSCALL_USE_SOCKETCALL
 #define SYSCALL_IPC_BROKEN_MODE
diff --git a/arch/microblaze/bits/posix.h b/arch/microblaze/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/microblaze/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/microblaze/bits/reg.h b/arch/microblaze/bits/reg.h
deleted file mode 100644
index 0c7bffca..00000000
--- a/arch/microblaze/bits/reg.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-/* FIXME */
diff --git a/arch/microblaze/bits/signal.h b/arch/microblaze/bits/signal.h
index 490f83bf..f25b7c6a 100644
--- a/arch/microblaze/bits/signal.h
+++ b/arch/microblaze/bits/signal.h
@@ -46,7 +46,6 @@ typedef struct __ucontext {
 #define SA_RESTART    0x10000000
 #define SA_NODEFER    0x40000000
 #define SA_RESETHAND  0x80000000
-#define SA_RESTORER   0x04000000
 
 #endif
 
diff --git a/arch/microblaze/bits/stdint.h b/arch/microblaze/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/microblaze/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/microblaze/bits/syscall.h.in b/arch/microblaze/bits/syscall.h.in
index 963386a8..40860e6d 100644
--- a/arch/microblaze/bits/syscall.h.in
+++ b/arch/microblaze/bits/syscall.h.in
@@ -427,4 +427,19 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/microblaze/pthread_arch.h b/arch/microblaze/pthread_arch.h
index f6ba8de9..ff26624e 100644
--- a/arch/microblaze/pthread_arch.h
+++ b/arch/microblaze/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("ori %0, r21, 0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("ori %0, r21, 0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC regs.pc
diff --git a/arch/microblaze/syscall_arch.h b/arch/microblaze/syscall_arch.h
index 169013f8..61d8248e 100644
--- a/arch/microblaze/syscall_arch.h
+++ b/arch/microblaze/syscall_arch.h
@@ -95,3 +95,5 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 }
 
 #define SYSCALL_IPC_BROKEN_MODE
+
+#undef SYS_socketcall
diff --git a/arch/mips/bits/posix.h b/arch/mips/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/mips/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/mips/bits/reg.h b/arch/mips/bits/reg.h
index 0c370987..2611b632 100644
--- a/arch/mips/bits/reg.h
+++ b/arch/mips/bits/reg.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-
 #define EF_R0 6
 #define EF_R1 7
 #define EF_R2 8
diff --git a/arch/mips/bits/signal.h b/arch/mips/bits/signal.h
index e1d97ac7..a3b3857a 100644
--- a/arch/mips/bits/signal.h
+++ b/arch/mips/bits/signal.h
@@ -66,7 +66,6 @@ typedef struct __ucontext {
 #define SA_RESTART    0x10000000
 #define SA_NODEFER    0x40000000
 #define SA_RESETHAND  0x80000000
-#define SA_RESTORER   0x04000000
 
 #undef SIG_BLOCK
 #undef SIG_UNBLOCK
@@ -93,7 +92,7 @@ typedef struct __ucontext {
 #define SIGTRAP   5
 #define SIGABRT   6
 #define SIGIOT    SIGABRT
-#define SIGSTKFLT 7
+#define SIGEMT    7
 #define SIGFPE    8
 #define SIGKILL   9
 #define SIGBUS    10
diff --git a/arch/mips/bits/stdint.h b/arch/mips/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/mips/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/mips/bits/syscall.h.in b/arch/mips/bits/syscall.h.in
index 86251bf3..55e35742 100644
--- a/arch/mips/bits/syscall.h.in
+++ b/arch/mips/bits/syscall.h.in
@@ -408,4 +408,19 @@
 #define __NR_fspick		4433
 #define __NR_pidfd_open		4434
 #define __NR_clone3		4435
+#define __NR_close_range	4436
+#define __NR_openat2		4437
+#define __NR_pidfd_getfd	4438
+#define __NR_faccessat2		4439
+#define __NR_process_madvise	4440
+#define __NR_epoll_pwait2	4441
+#define __NR_mount_setattr	4442
+#define __NR_landlock_create_ruleset	4444
+#define __NR_landlock_add_rule	4445
+#define __NR_landlock_restrict_self	4446
+#define __NR_process_mrelease	4448
+#define __NR_futex_waitv	4449
+#define __NR_set_mempolicy_home_node	4450
+#define __NR_cachestat		4451
+#define __NR_fchmodat2		4452
 
diff --git a/arch/mips/ksigaction.h b/arch/mips/ksigaction.h
index 63fdfab0..485abf75 100644
--- a/arch/mips/ksigaction.h
+++ b/arch/mips/ksigaction.h
@@ -4,10 +4,7 @@ struct k_sigaction {
 	unsigned flags;
 	void (*handler)(int);
 	unsigned long mask[4];
-	/* The following field is past the end of the structure the
-	 * kernel will read or write, and exists only to avoid having
-	 * mips-specific preprocessor conditionals in sigaction.c. */
-	void (*restorer)();
+	void *unused;
 };
 
 hidden void __restore(), __restore_rt();
diff --git a/arch/mips/pthread_arch.h b/arch/mips/pthread_arch.h
index 1e7839ea..376b7741 100644
--- a/arch/mips/pthread_arch.h
+++ b/arch/mips/pthread_arch.h
@@ -1,19 +1,18 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
+	register uintptr_t tp __asm__("$3");
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/mips/reloc.h b/arch/mips/reloc.h
index 88d23639..f4023b16 100644
--- a/arch/mips/reloc.h
+++ b/arch/mips/reloc.h
@@ -29,6 +29,7 @@
 
 #define NEED_MIPS_GOT_RELOCS 1
 #define DT_DEBUG_INDIRECT DT_MIPS_RLD_MAP
+#define DT_DEBUG_INDIRECT_REL DT_MIPS_RLD_MAP_REL
 #define ARCH_SYM_REJECT_UND(s) (!((s)->st_other & STO_MIPS_PLT))
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
diff --git a/arch/mips/syscall_arch.h b/arch/mips/syscall_arch.h
index f821e73f..5b7c38de 100644
--- a/arch/mips/syscall_arch.h
+++ b/arch/mips/syscall_arch.h
@@ -18,26 +18,26 @@
 static inline long __syscall0(long n)
 {
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+r"(r2), "=r"(r7)
-		:
+		"addu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2)
 		: SYSCALL_CLOBBERLIST, "$8", "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall1(long n, long a)
 {
 	register long r4 __asm__("$4") = a;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+r"(r2), "=r"(r7)
-		: "r"(r4)
+		"addu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4)
 		: SYSCALL_CLOBBERLIST, "$8", "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall2(long n, long a, long b)
@@ -45,13 +45,13 @@ static inline long __syscall2(long n, long a, long b)
 	register long r4 __asm__("$4") = a;
 	register long r5 __asm__("$5") = b;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5)
+		"addu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5)
 		: SYSCALL_CLOBBERLIST, "$8", "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall3(long n, long a, long b, long c)
@@ -60,13 +60,13 @@ static inline long __syscall3(long n, long a, long b, long c)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"addu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST, "$8", "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall4(long n, long a, long b, long c, long d)
@@ -75,13 +75,13 @@ static inline long __syscall4(long n, long a, long b, long c, long d)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"addu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST, "$8", "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall5(long n, long a, long b, long c, long d, long e)
@@ -91,15 +91,15 @@ static inline long __syscall5(long n, long a, long b, long c, long d, long e)
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
 		"subu $sp,$sp,32 ; sw $8,16($sp) ; "
-		"syscall ;"
+		"addu $2,$0,%3 ; syscall ;"
 		"addu $sp,$sp,32"
-		: "+r"(r2), "+r"(r7), "+r"(r8)
-		: "r"(r4), "r"(r5), "r"(r6)
+		: "=&r"(r2), "+r"(r7), "+r"(r8)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST, "$9", "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
@@ -110,15 +110,15 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
 	register long r9 __asm__("$9") = f;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
 		"subu $sp,$sp,32 ; sw $8,16($sp) ; sw $9,20($sp) ; "
-		"syscall ;"
+		"addu $2,$0,%4 ; syscall ;"
 		"addu $sp,$sp,32"
-		: "+r"(r2), "+r"(r7), "+r"(r8), "+r"(r9)
-		: "r"(r4), "r"(r5), "r"(r6)
+		: "=&r"(r2), "+r"(r7), "+r"(r8), "+r"(r9)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST, "$10");
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall7(long n, long a, long b, long c, long d, long e, long f, long g)
@@ -130,15 +130,15 @@ static inline long __syscall7(long n, long a, long b, long c, long d, long e, lo
 	register long r8 __asm__("$8") = e;
 	register long r9 __asm__("$9") = f;
 	register long r10 __asm__("$10") = g;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
 		"subu $sp,$sp,32 ; sw $8,16($sp) ; sw $9,20($sp) ; sw $10,24($sp) ; "
-		"syscall ;"
+		"addu $2,$0,%5 ; syscall ;"
 		"addu $sp,$sp,32"
-		: "+r"(r2), "+r"(r7), "+r"(r8), "+r"(r9), "+r"(r10)
-		: "r"(r4), "r"(r5), "r"(r6)
+		: "=&r"(r2), "+r"(r7), "+r"(r8), "+r"(r9), "+r"(r10)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 #define VDSO_USEFUL
@@ -149,3 +149,5 @@ static inline long __syscall7(long n, long a, long b, long c, long d, long e, lo
 
 #define SO_SNDTIMEO_OLD 0x1005
 #define SO_RCVTIMEO_OLD 0x1006
+
+#undef SYS_socketcall
diff --git a/arch/mips64/bits/fcntl.h b/arch/mips64/bits/fcntl.h
index 3bcec15e..5da1eef8 100644
--- a/arch/mips64/bits/fcntl.h
+++ b/arch/mips64/bits/fcntl.h
@@ -13,7 +13,7 @@
 
 #define O_ASYNC      010000
 #define O_DIRECT    0100000
-#define O_LARGEFILE       0
+#define O_LARGEFILE  020000
 #define O_NOATIME  01000000
 #define O_PATH    010000000
 #define O_TMPFILE 020200000
diff --git a/arch/mips64/bits/posix.h b/arch/mips64/bits/posix.h
deleted file mode 100644
index acf42944..00000000
--- a/arch/mips64/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFFBIG 1
-#define _POSIX_V7_LP64_OFFBIG 1
diff --git a/arch/mips64/bits/reg.h b/arch/mips64/bits/reg.h
index a3f63acc..16178dd3 100644
--- a/arch/mips64/bits/reg.h
+++ b/arch/mips64/bits/reg.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-
 #define EF_R0 0
 #define EF_R1 1
 #define EF_R2 2
diff --git a/arch/mips64/bits/signal.h b/arch/mips64/bits/signal.h
index c31ad07e..ffec7fd0 100644
--- a/arch/mips64/bits/signal.h
+++ b/arch/mips64/bits/signal.h
@@ -85,7 +85,6 @@ typedef struct __ucontext {
 #define SA_RESTART    0x10000000
 #define SA_NODEFER    0x40000000
 #define SA_RESETHAND  0x80000000
-#define SA_RESTORER   0x04000000
 
 #undef SIG_BLOCK
 #undef SIG_UNBLOCK
@@ -112,7 +111,7 @@ typedef struct __ucontext {
 #define SIGTRAP   5
 #define SIGABRT   6
 #define SIGIOT    SIGABRT
-#define SIGSTKFLT 7
+#define SIGEMT    7
 #define SIGFPE    8
 #define SIGKILL   9
 #define SIGBUS    10
diff --git a/arch/mips64/bits/stdint.h b/arch/mips64/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/mips64/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/mips64/bits/syscall.h.in b/arch/mips64/bits/syscall.h.in
index 9b406e9a..50cec45a 100644
--- a/arch/mips64/bits/syscall.h.in
+++ b/arch/mips64/bits/syscall.h.in
@@ -338,4 +338,19 @@
 #define __NR_fspick		5433
 #define __NR_pidfd_open		5434
 #define __NR_clone3		5435
+#define __NR_close_range	5436
+#define __NR_openat2		5437
+#define __NR_pidfd_getfd	5438
+#define __NR_faccessat2		5439
+#define __NR_process_madvise	5440
+#define __NR_epoll_pwait2	5441
+#define __NR_mount_setattr	5442
+#define __NR_landlock_create_ruleset	5444
+#define __NR_landlock_add_rule	5445
+#define __NR_landlock_restrict_self	5446
+#define __NR_process_mrelease	5448
+#define __NR_futex_waitv	5449
+#define __NR_set_mempolicy_home_node	5450
+#define __NR_cachestat		5451
+#define __NR_fchmodat2		5452
 
diff --git a/arch/mips64/ksigaction.h b/arch/mips64/ksigaction.h
index c16e4731..b4d0fa5f 100644
--- a/arch/mips64/ksigaction.h
+++ b/arch/mips64/ksigaction.h
@@ -4,7 +4,7 @@ struct k_sigaction {
 	unsigned flags;
 	void (*handler)(int);
 	unsigned long mask[2];
-	void (*restorer)();
+	void *unused;
 };
 
 hidden void __restore(), __restore_rt();
diff --git a/arch/mips64/pthread_arch.h b/arch/mips64/pthread_arch.h
index 1e7839ea..c45347ab 100644
--- a/arch/mips64/pthread_arch.h
+++ b/arch/mips64/pthread_arch.h
@@ -1,19 +1,19 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
+	register uintptr_t tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/mips64/reloc.h b/arch/mips64/reloc.h
index fdb5edc9..145d8b0b 100644
--- a/arch/mips64/reloc.h
+++ b/arch/mips64/reloc.h
@@ -38,6 +38,7 @@
 
 #define NEED_MIPS_GOT_RELOCS 1
 #define DT_DEBUG_INDIRECT DT_MIPS_RLD_MAP
+#define DT_DEBUG_INDIRECT_REL DT_MIPS_RLD_MAP_REL
 #define ARCH_SYM_REJECT_UND(s) (!((s)->st_other & STO_MIPS_PLT))
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
diff --git a/arch/mips64/syscall_arch.h b/arch/mips64/syscall_arch.h
index 69c429b8..ae6532fc 100644
--- a/arch/mips64/syscall_arch.h
+++ b/arch/mips64/syscall_arch.h
@@ -16,26 +16,26 @@
 static inline long __syscall0(long n)
 {
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		:
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall1(long n, long a)
 {
 	register long r4 __asm__("$4") = a;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall2(long n, long a, long b)
@@ -43,14 +43,14 @@ static inline long __syscall2(long n, long a, long b)
 	register long r4 __asm__("$4") = a;
 	register long r5 __asm__("$5") = b;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall3(long n, long a, long b, long c)
@@ -59,14 +59,14 @@ static inline long __syscall3(long n, long a, long b, long c)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall4(long n, long a, long b, long c, long d)
@@ -75,14 +75,14 @@ static inline long __syscall4(long n, long a, long b, long c, long d)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall5(long n, long a, long b, long c, long d, long e)
@@ -92,14 +92,14 @@ static inline long __syscall5(long n, long a, long b, long c, long d, long e)
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6), "r"(r8)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6), "r"(r8)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
@@ -110,14 +110,14 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
 	register long r9 __asm__("$9") = f;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6), "r"(r8), "r"(r9)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6), "r"(r8), "r"(r9)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 #define VDSO_USEFUL
diff --git a/arch/mipsn32/bits/posix.h b/arch/mipsn32/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/mipsn32/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/mipsn32/bits/reg.h b/arch/mipsn32/bits/reg.h
index a3f63acc..16178dd3 100644
--- a/arch/mipsn32/bits/reg.h
+++ b/arch/mipsn32/bits/reg.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-
 #define EF_R0 0
 #define EF_R1 1
 #define EF_R2 2
diff --git a/arch/mipsn32/bits/signal.h b/arch/mipsn32/bits/signal.h
index c31ad07e..ffec7fd0 100644
--- a/arch/mipsn32/bits/signal.h
+++ b/arch/mipsn32/bits/signal.h
@@ -85,7 +85,6 @@ typedef struct __ucontext {
 #define SA_RESTART    0x10000000
 #define SA_NODEFER    0x40000000
 #define SA_RESETHAND  0x80000000
-#define SA_RESTORER   0x04000000
 
 #undef SIG_BLOCK
 #undef SIG_UNBLOCK
@@ -112,7 +111,7 @@ typedef struct __ucontext {
 #define SIGTRAP   5
 #define SIGABRT   6
 #define SIGIOT    SIGABRT
-#define SIGSTKFLT 7
+#define SIGEMT    7
 #define SIGFPE    8
 #define SIGKILL   9
 #define SIGBUS    10
diff --git a/arch/mipsn32/bits/stdint.h b/arch/mipsn32/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/mipsn32/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/mipsn32/bits/syscall.h.in b/arch/mipsn32/bits/syscall.h.in
index 2ad48d10..9a4bd301 100644
--- a/arch/mipsn32/bits/syscall.h.in
+++ b/arch/mipsn32/bits/syscall.h.in
@@ -362,4 +362,19 @@
 #define __NR_fspick		6433
 #define __NR_pidfd_open		6434
 #define __NR_clone3		6435
+#define __NR_close_range	6436
+#define __NR_openat2		6437
+#define __NR_pidfd_getfd	6438
+#define __NR_faccessat2		6439
+#define __NR_process_madvise	6440
+#define __NR_epoll_pwait2	6441
+#define __NR_mount_setattr	6442
+#define __NR_landlock_create_ruleset	6444
+#define __NR_landlock_add_rule	6445
+#define __NR_landlock_restrict_self	6446
+#define __NR_process_mrelease	6448
+#define __NR_futex_waitv	6449
+#define __NR_set_mempolicy_home_node	6450
+#define __NR_cachestat		6451
+#define __NR_fchmodat2		6452
 
diff --git a/arch/mipsn32/ksigaction.h b/arch/mipsn32/ksigaction.h
index b565f1fc..485abf75 100644
--- a/arch/mipsn32/ksigaction.h
+++ b/arch/mipsn32/ksigaction.h
@@ -4,7 +4,7 @@ struct k_sigaction {
 	unsigned flags;
 	void (*handler)(int);
 	unsigned long mask[4];
-	void (*restorer)();
+	void *unused;
 };
 
 hidden void __restore(), __restore_rt();
diff --git a/arch/mipsn32/pthread_arch.h b/arch/mipsn32/pthread_arch.h
index 1e7839ea..c45347ab 100644
--- a/arch/mipsn32/pthread_arch.h
+++ b/arch/mipsn32/pthread_arch.h
@@ -1,19 +1,19 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
+	register uintptr_t tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/mipsn32/reloc.h b/arch/mipsn32/reloc.h
index 47c6e591..bf00bd6a 100644
--- a/arch/mipsn32/reloc.h
+++ b/arch/mipsn32/reloc.h
@@ -29,6 +29,7 @@
 
 #define NEED_MIPS_GOT_RELOCS 1
 #define DT_DEBUG_INDIRECT DT_MIPS_RLD_MAP
+#define DT_DEBUG_INDIRECT_REL DT_MIPS_RLD_MAP_REL
 #define ARCH_SYM_REJECT_UND(s) (!((s)->st_other & STO_MIPS_PLT))
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
diff --git a/arch/mipsn32/syscall_arch.h b/arch/mipsn32/syscall_arch.h
index c1a4b7da..c681905d 100644
--- a/arch/mipsn32/syscall_arch.h
+++ b/arch/mipsn32/syscall_arch.h
@@ -16,26 +16,26 @@
 static inline long __syscall0(long n)
 {
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		:
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall1(long n, long a)
 {
 	register long r4 __asm__("$4") = a;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall2(long n, long a, long b)
@@ -43,13 +43,14 @@ static inline long __syscall2(long n, long a, long b)
 	register long r4 __asm__("$4") = a;
 	register long r5 __asm__("$5") = b;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
+
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall3(long n, long a, long b, long c)
@@ -58,13 +59,14 @@ static inline long __syscall3(long n, long a, long b, long c)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7");
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
+
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "=r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "=r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall4(long n, long a, long b, long c, long d)
@@ -73,13 +75,14 @@ static inline long __syscall4(long n, long a, long b, long c, long d)
 	register long r5 __asm__("$5") = b;
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
+
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall5(long n, long a, long b, long c, long d, long e)
@@ -89,13 +92,14 @@ static inline long __syscall5(long n, long a, long b, long c, long d, long e)
 	register long r6 __asm__("$6") = c;
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
+
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6), "r"(r8)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6), "r"(r8)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
@@ -106,13 +110,14 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	register long r7 __asm__("$7") = d;
 	register long r8 __asm__("$8") = e;
 	register long r9 __asm__("$9") = f;
-	register long r2 __asm__("$2") = n;
+	register long r2 __asm__("$2");
+
 	__asm__ __volatile__ (
-		"syscall"
-		: "+&r"(r2), "+r"(r7)
-		: "r"(r4), "r"(r5), "r"(r6), "r"(r8), "r"(r9)
+		"daddu $2,$0,%2 ; syscall"
+		: "=&r"(r2), "+r"(r7)
+		: "ir"(n), "0"(r2), "r"(r4), "r"(r5), "r"(r6), "r"(r8), "r"(r9)
 		: SYSCALL_CLOBBERLIST);
-	return r7 ? -r2 : r2;
+	return r7 && r2>0 ? -r2 : r2;
 }
 
 #define VDSO_USEFUL
diff --git a/arch/or1k/bits/posix.h b/arch/or1k/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/or1k/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/or1k/bits/reg.h b/arch/or1k/bits/reg.h
deleted file mode 100644
index 0c7bffca..00000000
--- a/arch/or1k/bits/reg.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-/* FIXME */
diff --git a/arch/or1k/bits/signal.h b/arch/or1k/bits/signal.h
index be576d1d..c45be676 100644
--- a/arch/or1k/bits/signal.h
+++ b/arch/or1k/bits/signal.h
@@ -43,7 +43,6 @@ typedef struct __ucontext {
 #define SA_RESTART    0x10000000
 #define SA_NODEFER    0x40000000
 #define SA_RESETHAND  0x80000000
-#define SA_RESTORER   0x04000000
 
 #endif
 
diff --git a/arch/or1k/bits/stdint.h b/arch/or1k/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/or1k/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/or1k/bits/syscall.h.in b/arch/or1k/bits/syscall.h.in
index e9c925e4..00812bf8 100644
--- a/arch/or1k/bits/syscall.h.in
+++ b/arch/or1k/bits/syscall.h.in
@@ -311,4 +311,19 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/or1k/pthread_arch.h b/arch/or1k/pthread_arch.h
index 1b806f89..f75ea7e4 100644
--- a/arch/or1k/pthread_arch.h
+++ b/arch/or1k/pthread_arch.h
@@ -1,18 +1,16 @@
-/* or1k use variant I, but with the twist that tp points to the end of TCB */
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #ifdef __clang__
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("l.ori %0, r10, 0" : "=r" (tp) );
 #else
-	register char *tp __asm__("r10");
+	register uintptr_t tp __asm__("r10");
 	__asm__ ("" : "=r" (tp) );
 #endif
-	return (struct pthread *) (tp - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC regs.pc
diff --git a/arch/powerpc/bits/fenv.h b/arch/powerpc/bits/fenv.h
index c5a3e5c5..5b15c69a 100644
--- a/arch/powerpc/bits/fenv.h
+++ b/arch/powerpc/bits/fenv.h
@@ -1,4 +1,4 @@
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 #define FE_ALL_EXCEPT 0
 #define FE_TONEAREST  0
 #else
diff --git a/arch/powerpc/bits/hwcap.h b/arch/powerpc/bits/hwcap.h
index 803de9b5..12981623 100644
--- a/arch/powerpc/bits/hwcap.h
+++ b/arch/powerpc/bits/hwcap.h
@@ -41,3 +41,5 @@
 #define PPC_FEATURE2_DARN		0x00200000
 #define PPC_FEATURE2_SCV		0x00100000
 #define PPC_FEATURE2_HTM_NO_SUSPEND	0x00080000
+#define PPC_FEATURE2_ARCH_3_1		0x00040000
+#define PPC_FEATURE2_MMA		0x00020000
diff --git a/arch/powerpc/bits/mman.h b/arch/powerpc/bits/mman.h
index b3a675a8..95ec4358 100644
--- a/arch/powerpc/bits/mman.h
+++ b/arch/powerpc/bits/mman.h
@@ -4,7 +4,6 @@
 #define MAP_NORESERVE   0x40
 #undef MAP_LOCKED
 #define MAP_LOCKED	0x80
-#undef MAP_SYNC
 
 #undef MCL_CURRENT
 #define MCL_CURRENT     0x2000
diff --git a/arch/powerpc/bits/posix.h b/arch/powerpc/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/powerpc/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/powerpc/bits/reg.h b/arch/powerpc/bits/reg.h
deleted file mode 100644
index 0c7bffca..00000000
--- a/arch/powerpc/bits/reg.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-/* FIXME */
diff --git a/arch/powerpc/bits/shm.h b/arch/powerpc/bits/shm.h
index fb1d4020..7f1ca17e 100644
--- a/arch/powerpc/bits/shm.h
+++ b/arch/powerpc/bits/shm.h
@@ -8,11 +8,11 @@ struct shmid_ds {
 	unsigned long __shm_dtime_lo;
 	unsigned long __shm_ctime_hi;
 	unsigned long __shm_ctime_lo;
+	unsigned long __pad1;
 	size_t shm_segsz;
 	pid_t shm_cpid;
 	pid_t shm_lpid;
 	unsigned long shm_nattch;
-	unsigned long __pad1;
 	unsigned long __pad2;
 	time_t shm_atime;
 	time_t shm_dtime;
diff --git a/arch/powerpc/bits/stdint.h b/arch/powerpc/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/powerpc/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/powerpc/bits/syscall.h.in b/arch/powerpc/bits/syscall.h.in
index 8d4f79b5..ea95f3ed 100644
--- a/arch/powerpc/bits/syscall.h.in
+++ b/arch/powerpc/bits/syscall.h.in
@@ -415,4 +415,19 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/powerpc/pthread_arch.h b/arch/powerpc/pthread_arch.h
index ae0f28d6..42e88b07 100644
--- a/arch/powerpc/pthread_arch.h
+++ b/arch/powerpc/pthread_arch.h
@@ -1,18 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	register char *tp __asm__("r2");
+	register uintptr_t tp __asm__("r2");
 	__asm__ ("" : "=r" (tp) );
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
                         
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 // the kernel calls the ip "nip", it's the first saved value after the 32
 // GPRs.
 #define MC_PC gregs[32]
-
-#define CANARY canary_at_end
diff --git a/arch/powerpc/reloc.h b/arch/powerpc/reloc.h
index 527b6b7c..fdfbf827 100644
--- a/arch/powerpc/reloc.h
+++ b/arch/powerpc/reloc.h
@@ -1,4 +1,4 @@
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 #define FP_SUFFIX "-sf"
 #else
 #define FP_SUFFIX ""
diff --git a/arch/powerpc/syscall_arch.h b/arch/powerpc/syscall_arch.h
index ede97c1c..54c885cb 100644
--- a/arch/powerpc/syscall_arch.h
+++ b/arch/powerpc/syscall_arch.h
@@ -92,3 +92,9 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 
 #define SO_RCVTIMEO_OLD  18
 #define SO_SNDTIMEO_OLD  19
+
+#define VDSO_USEFUL
+#define VDSO_CGT32_SYM "__kernel_clock_gettime"
+#define VDSO_CGT32_VER "LINUX_2.6.15"
+#define VDSO_CGT_SYM "__kernel_clock_gettime64"
+#define VDSO_CGT_VER "LINUX_5.11"
diff --git a/arch/powerpc64/bits/hwcap.h b/arch/powerpc64/bits/hwcap.h
index 803de9b5..12981623 100644
--- a/arch/powerpc64/bits/hwcap.h
+++ b/arch/powerpc64/bits/hwcap.h
@@ -41,3 +41,5 @@
 #define PPC_FEATURE2_DARN		0x00200000
 #define PPC_FEATURE2_SCV		0x00100000
 #define PPC_FEATURE2_HTM_NO_SUSPEND	0x00080000
+#define PPC_FEATURE2_ARCH_3_1		0x00040000
+#define PPC_FEATURE2_MMA		0x00020000
diff --git a/arch/powerpc64/bits/mman.h b/arch/powerpc64/bits/mman.h
index b3a675a8..95ec4358 100644
--- a/arch/powerpc64/bits/mman.h
+++ b/arch/powerpc64/bits/mman.h
@@ -4,7 +4,6 @@
 #define MAP_NORESERVE   0x40
 #undef MAP_LOCKED
 #define MAP_LOCKED	0x80
-#undef MAP_SYNC
 
 #undef MCL_CURRENT
 #define MCL_CURRENT     0x2000
diff --git a/arch/powerpc64/bits/posix.h b/arch/powerpc64/bits/posix.h
deleted file mode 100644
index c37b94c1..00000000
--- a/arch/powerpc64/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFF64  1
-#define _POSIX_V7_LP64_OFF64  1
diff --git a/arch/powerpc64/bits/reg.h b/arch/powerpc64/bits/reg.h
deleted file mode 100644
index 49382c8f..00000000
--- a/arch/powerpc64/bits/reg.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-/* FIXME */
diff --git a/arch/powerpc64/bits/stdint.h b/arch/powerpc64/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/powerpc64/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/powerpc64/bits/syscall.h.in b/arch/powerpc64/bits/syscall.h.in
index b935864c..43551079 100644
--- a/arch/powerpc64/bits/syscall.h.in
+++ b/arch/powerpc64/bits/syscall.h.in
@@ -387,4 +387,19 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/powerpc64/pthread_arch.h b/arch/powerpc64/pthread_arch.h
index 79c3ecd8..1b7b9079 100644
--- a/arch/powerpc64/pthread_arch.h
+++ b/arch/powerpc64/pthread_arch.h
@@ -1,18 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	register char *tp __asm__("r13");
+	register uintptr_t tp __asm__("r13");
 	__asm__ ("" : "=r" (tp) );
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 // the kernel calls the ip "nip", it's the first saved value after the 32
 // GPRs.
 #define MC_PC gp_regs[32]
-
-#define CANARY canary_at_end
diff --git a/arch/powerpc64/syscall_arch.h b/arch/powerpc64/syscall_arch.h
index 76b4e335..7d34fbe4 100644
--- a/arch/powerpc64/syscall_arch.h
+++ b/arch/powerpc64/syscall_arch.h
@@ -88,3 +88,7 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 
 #define SO_RCVTIMEO_OLD  18
 #define SO_SNDTIMEO_OLD  19
+
+#define VDSO_USEFUL
+#define VDSO_CGT_SYM "__kernel_clock_gettime"
+#define VDSO_CGT_VER "LINUX_2.6.15"
diff --git a/arch/riscv32/atomic_arch.h b/arch/riscv32/atomic_arch.h
new file mode 100644
index 00000000..4d418f63
--- /dev/null
+++ b/arch/riscv32/atomic_arch.h
@@ -0,0 +1,21 @@
+#define a_barrier a_barrier
+static inline void a_barrier()
+{
+	__asm__ __volatile__ ("fence rw,rw" : : : "memory");
+}
+
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	int old, tmp;
+	__asm__ __volatile__ (
+		"\n1:	lr.w.aqrl %0, (%2)\n"
+		"	bne %0, %3, 1f\n"
+		"	sc.w.aqrl %1, %4, (%2)\n"
+		"	bnez %1, 1b\n"
+		"1:"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(p), "r"((long)t), "r"((long)s)
+		: "memory");
+	return old;
+}
diff --git a/arch/riscv32/bits/alltypes.h.in b/arch/riscv32/bits/alltypes.h.in
new file mode 100644
index 00000000..e2b6129e
--- /dev/null
+++ b/arch/riscv32/bits/alltypes.h.in
@@ -0,0 +1,18 @@
+#define _Addr int
+#define _Int64 long long
+#define _Reg int
+
+#define __BYTE_ORDER 1234
+#define __LONG_MAX 0x7fffffffL
+
+#ifndef __cplusplus
+TYPEDEF int wchar_t;
+#endif
+
+TYPEDEF int blksize_t;
+TYPEDEF unsigned int nlink_t;
+
+TYPEDEF float float_t;
+TYPEDEF double double_t;
+
+TYPEDEF struct { long long __ll; long double __ld; } max_align_t;
diff --git a/arch/riscv32/bits/fenv.h b/arch/riscv32/bits/fenv.h
new file mode 100644
index 00000000..806ec40f
--- /dev/null
+++ b/arch/riscv32/bits/fenv.h
@@ -0,0 +1,17 @@
+#define FE_INVALID      16
+#define FE_DIVBYZERO    8
+#define FE_OVERFLOW     4
+#define FE_UNDERFLOW    2
+#define FE_INEXACT      1
+
+#define FE_ALL_EXCEPT   31
+
+#define FE_TONEAREST    0
+#define FE_DOWNWARD     2
+#define FE_UPWARD       3
+#define FE_TOWARDZERO   1
+
+typedef unsigned int fexcept_t;
+typedef unsigned int fenv_t;
+
+#define FE_DFL_ENV      ((const fenv_t *) -1)
diff --git a/arch/riscv32/bits/float.h b/arch/riscv32/bits/float.h
new file mode 100644
index 00000000..719c7908
--- /dev/null
+++ b/arch/riscv32/bits/float.h
@@ -0,0 +1,16 @@
+#define FLT_EVAL_METHOD 0
+
+#define LDBL_TRUE_MIN 6.47517511943802511092443895822764655e-4966L
+#define LDBL_MIN 3.36210314311209350626267781732175260e-4932L
+#define LDBL_MAX 1.18973149535723176508575932662800702e+4932L
+#define LDBL_EPSILON 1.92592994438723585305597794258492732e-34L
+
+#define LDBL_MANT_DIG 113
+#define LDBL_MIN_EXP (-16381)
+#define LDBL_MAX_EXP 16384
+
+#define LDBL_DIG 33
+#define LDBL_MIN_10_EXP (-4931)
+#define LDBL_MAX_10_EXP 4932
+
+#define DECIMAL_DIG 36
diff --git a/arch/riscv32/bits/ipcstat.h b/arch/riscv32/bits/ipcstat.h
new file mode 100644
index 00000000..4f4fcb0c
--- /dev/null
+++ b/arch/riscv32/bits/ipcstat.h
@@ -0,0 +1 @@
+#define IPC_STAT 0x102
diff --git a/arch/riscv32/bits/msg.h b/arch/riscv32/bits/msg.h
new file mode 100644
index 00000000..7bbbb2bf
--- /dev/null
+++ b/arch/riscv32/bits/msg.h
@@ -0,0 +1,18 @@
+struct msqid_ds {
+	struct ipc_perm msg_perm;
+	unsigned long __msg_stime_lo;
+	unsigned long __msg_stime_hi;
+	unsigned long __msg_rtime_lo;
+	unsigned long __msg_rtime_hi;
+	unsigned long __msg_ctime_lo;
+	unsigned long __msg_ctime_hi;
+	unsigned long msg_cbytes;
+	msgqnum_t msg_qnum;
+	msglen_t msg_qbytes;
+	pid_t msg_lspid;
+	pid_t msg_lrpid;
+	unsigned long __unused[2];
+	time_t msg_stime;
+	time_t msg_rtime;
+	time_t msg_ctime;
+};
diff --git a/arch/riscv32/bits/sem.h b/arch/riscv32/bits/sem.h
new file mode 100644
index 00000000..544e3d2a
--- /dev/null
+++ b/arch/riscv32/bits/sem.h
@@ -0,0 +1,18 @@
+struct semid_ds {
+	struct ipc_perm sem_perm;
+	unsigned long __sem_otime_lo;
+	unsigned long __sem_otime_hi;
+	unsigned long __sem_ctime_lo;
+	unsigned long __sem_ctime_hi;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+	unsigned short sem_nsems;
+	char __sem_nsems_pad[sizeof(long)-sizeof(short)];
+#else
+	char __sem_nsems_pad[sizeof(long)-sizeof(short)];
+	unsigned short sem_nsems;
+#endif
+	long __unused3;
+	long __unused4;
+	time_t sem_otime;
+	time_t sem_ctime;
+};
diff --git a/arch/riscv32/bits/setjmp.h b/arch/riscv32/bits/setjmp.h
new file mode 100644
index 00000000..51e96276
--- /dev/null
+++ b/arch/riscv32/bits/setjmp.h
@@ -0,0 +1 @@
+typedef unsigned long long __jmp_buf[19];
diff --git a/arch/riscv32/bits/shm.h b/arch/riscv32/bits/shm.h
new file mode 100644
index 00000000..725fb469
--- /dev/null
+++ b/arch/riscv32/bits/shm.h
@@ -0,0 +1,31 @@
+#define SHMLBA 4096
+
+struct shmid_ds {
+	struct ipc_perm shm_perm;
+	size_t shm_segsz;
+	unsigned long __shm_atime_lo;
+	unsigned long __shm_atime_hi;
+	unsigned long __shm_dtime_lo;
+	unsigned long __shm_dtime_hi;
+	unsigned long __shm_ctime_lo;
+	unsigned long __shm_ctime_hi;
+	pid_t shm_cpid;
+	pid_t shm_lpid;
+	unsigned long shm_nattch;
+	unsigned long __pad1;
+	unsigned long __pad2;
+	unsigned long __pad3;
+	time_t shm_atime;
+	time_t shm_dtime;
+	time_t shm_ctime;
+};
+
+struct shminfo {
+	unsigned long shmmax, shmmin, shmmni, shmseg, shmall, __unused[4];
+};
+
+struct shm_info {
+	int __used_ids;
+	unsigned long shm_tot, shm_rss, shm_swp;
+	unsigned long __swap_attempts, __swap_successes;
+};
diff --git a/arch/riscv32/bits/signal.h b/arch/riscv32/bits/signal.h
new file mode 100644
index 00000000..50b66ec9
--- /dev/null
+++ b/arch/riscv32/bits/signal.h
@@ -0,0 +1,120 @@
+#if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
+ || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+
+#if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+# define MINSIGSTKSZ 2048
+# define SIGSTKSZ 8192
+#endif
+
+typedef unsigned long __riscv_mc_gp_state[32];
+
+struct __riscv_mc_f_ext_state {
+	unsigned int __f[32];
+	unsigned int __fcsr;
+};
+
+struct __riscv_mc_d_ext_state {
+	unsigned long long __f[32];
+	unsigned int __fcsr;
+};
+
+struct __riscv_mc_q_ext_state {
+	unsigned long long __f[64] __attribute__((__aligned__(16)));
+	unsigned int __fcsr;
+	unsigned int __reserved[3];
+};
+
+union __riscv_mc_fp_state {
+	struct __riscv_mc_f_ext_state __f;
+	struct __riscv_mc_d_ext_state __d;
+	struct __riscv_mc_q_ext_state __q;
+};
+
+typedef struct mcontext_t {
+	__riscv_mc_gp_state __gregs;
+	union __riscv_mc_fp_state __fpregs;
+} mcontext_t;
+
+#if defined(_GNU_SOURCE)
+#define REG_PC 0
+#define REG_RA 1
+#define REG_SP 2
+#define REG_TP 4
+#define REG_S0 8
+#define REG_S1 9
+#define REG_A0 10
+#define REG_S2 18
+#endif
+
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+typedef unsigned long greg_t;
+typedef unsigned long gregset_t[32];
+typedef union __riscv_mc_fp_state fpregset_t;
+struct sigcontext {
+	gregset_t gregs;
+	fpregset_t fpregs;
+};
+#endif
+
+struct sigaltstack {
+	void *ss_sp;
+	int ss_flags;
+	size_t ss_size;
+};
+
+typedef struct __ucontext
+{
+	unsigned long uc_flags;
+	struct __ucontext *uc_link;
+	stack_t uc_stack;
+	sigset_t uc_sigmask;
+	mcontext_t uc_mcontext;
+} ucontext_t;
+
+#define SA_NOCLDSTOP 1
+#define SA_NOCLDWAIT 2
+#define SA_SIGINFO   4
+#define SA_ONSTACK   0x08000000
+#define SA_RESTART   0x10000000
+#define SA_NODEFER   0x40000000
+#define SA_RESETHAND 0x80000000
+#define SA_RESTORER  0x04000000
+
+#endif
+
+#define SIGHUP     1
+#define SIGINT     2
+#define SIGQUIT    3
+#define SIGILL     4
+#define SIGTRAP    5
+#define SIGABRT    6
+#define SIGIOT     SIGABRT
+#define SIGBUS     7
+#define SIGFPE     8
+#define SIGKILL    9
+#define SIGUSR1   10
+#define SIGSEGV   11
+#define SIGUSR2   12
+#define SIGPIPE   13
+#define SIGALRM   14
+#define SIGTERM   15
+#define SIGSTKFLT 16
+#define SIGCHLD   17
+#define SIGCONT   18
+#define SIGSTOP   19
+#define SIGTSTP   20
+#define SIGTTIN   21
+#define SIGTTOU   22
+#define SIGURG    23
+#define SIGXCPU   24
+#define SIGXFSZ   25
+#define SIGVTALRM 26
+#define SIGPROF   27
+#define SIGWINCH  28
+#define SIGIO     29
+#define SIGPOLL   SIGIO
+#define SIGPWR    30
+#define SIGSYS    31
+#define SIGUNUSED SIGSYS
+
+#define _NSIG     65
diff --git a/arch/riscv32/bits/syscall.h.in b/arch/riscv32/bits/syscall.h.in
new file mode 100644
index 00000000..9228d840
--- /dev/null
+++ b/arch/riscv32/bits/syscall.h.in
@@ -0,0 +1,300 @@
+#define __NR_io_setup 0
+#define __NR_io_destroy 1
+#define __NR_io_submit 2
+#define __NR_io_cancel 3
+#define __NR_setxattr 5
+#define __NR_lsetxattr 6
+#define __NR_fsetxattr 7
+#define __NR_getxattr 8
+#define __NR_lgetxattr 9
+#define __NR_fgetxattr 10
+#define __NR_listxattr 11
+#define __NR_llistxattr 12
+#define __NR_flistxattr 13
+#define __NR_removexattr 14
+#define __NR_lremovexattr 15
+#define __NR_fremovexattr 16
+#define __NR_getcwd 17
+#define __NR_lookup_dcookie 18
+#define __NR_eventfd2 19
+#define __NR_epoll_create1 20
+#define __NR_epoll_ctl 21
+#define __NR_epoll_pwait 22
+#define __NR_dup 23
+#define __NR_dup3 24
+#define __NR_fcntl64 25
+#define __NR_inotify_init1 26
+#define __NR_inotify_add_watch 27
+#define __NR_inotify_rm_watch 28
+#define __NR_ioctl 29
+#define __NR_ioprio_set 30
+#define __NR_ioprio_get 31
+#define __NR_flock 32
+#define __NR_mknodat 33
+#define __NR_mkdirat 34
+#define __NR_unlinkat 35
+#define __NR_symlinkat 36
+#define __NR_linkat 37
+#define __NR_umount2 39
+#define __NR_mount 40
+#define __NR_pivot_root 41
+#define __NR_nfsservctl 42
+#define __NR_statfs64 43
+#define __NR_fstatfs64 44
+#define __NR_truncate64 45
+#define __NR_ftruncate64 46
+#define __NR_fallocate 47
+#define __NR_faccessat 48
+#define __NR_chdir 49
+#define __NR_fchdir 50
+#define __NR_chroot 51
+#define __NR_fchmod 52
+#define __NR_fchmodat 53
+#define __NR_fchownat 54
+#define __NR_fchown 55
+#define __NR_openat 56
+#define __NR_close 57
+#define __NR_vhangup 58
+#define __NR_pipe2 59
+#define __NR_quotactl 60
+#define __NR_getdents64 61
+#define __NR__llseek 62
+#define __NR_read 63
+#define __NR_write 64
+#define __NR_readv 65
+#define __NR_writev 66
+#define __NR_pread64 67
+#define __NR_pwrite64 68
+#define __NR_preadv 69
+#define __NR_pwritev 70
+#define __NR_sendfile64 71
+#define __NR_signalfd4 74
+#define __NR_vmsplice 75
+#define __NR_splice 76
+#define __NR_tee 77
+#define __NR_readlinkat 78
+#define __NR_sync 81
+#define __NR_fsync 82
+#define __NR_fdatasync 83
+#define __NR_sync_file_range 84
+#define __NR_timerfd_create 85
+#define __NR_acct 89
+#define __NR_capget 90
+#define __NR_capset 91
+#define __NR_personality 92
+#define __NR_exit 93
+#define __NR_exit_group 94
+#define __NR_waitid 95
+#define __NR_set_tid_address 96
+#define __NR_unshare 97
+#define __NR_set_robust_list 99
+#define __NR_get_robust_list 100
+#define __NR_nanosleep 101
+#define __NR_getitimer 102
+#define __NR_setitimer 103
+#define __NR_kexec_load 104
+#define __NR_init_module 105
+#define __NR_delete_module 106
+#define __NR_timer_create 107
+#define __NR_timer_getoverrun 109
+#define __NR_timer_delete 111
+#define __NR_syslog 116
+#define __NR_ptrace 117
+#define __NR_sched_setparam 118
+#define __NR_sched_setscheduler 119
+#define __NR_sched_getscheduler 120
+#define __NR_sched_getparam 121
+#define __NR_sched_setaffinity 122
+#define __NR_sched_getaffinity 123
+#define __NR_sched_yield 124
+#define __NR_sched_get_priority_max 125
+#define __NR_sched_get_priority_min 126
+#define __NR_restart_syscall 128
+#define __NR_kill 129
+#define __NR_tkill 130
+#define __NR_tgkill 131
+#define __NR_sigaltstack 132
+#define __NR_rt_sigsuspend 133
+#define __NR_rt_sigaction 134
+#define __NR_rt_sigprocmask 135
+#define __NR_rt_sigpending 136
+#define __NR_rt_sigqueueinfo 138
+#define __NR_rt_sigreturn 139
+#define __NR_setpriority 140
+#define __NR_getpriority 141
+#define __NR_reboot 142
+#define __NR_setregid 143
+#define __NR_setgid 144
+#define __NR_setreuid 145
+#define __NR_setuid 146
+#define __NR_setresuid 147
+#define __NR_getresuid 148
+#define __NR_setresgid 149
+#define __NR_getresgid 150
+#define __NR_setfsuid 151
+#define __NR_setfsgid 152
+#define __NR_times 153
+#define __NR_setpgid 154
+#define __NR_getpgid 155
+#define __NR_getsid 156
+#define __NR_setsid 157
+#define __NR_getgroups 158
+#define __NR_setgroups 159
+#define __NR_uname 160
+#define __NR_sethostname 161
+#define __NR_setdomainname 162
+#define __NR_getrusage 165
+#define __NR_umask 166
+#define __NR_prctl 167
+#define __NR_getcpu 168
+#define __NR_getpid 172
+#define __NR_getppid 173
+#define __NR_getuid 174
+#define __NR_geteuid 175
+#define __NR_getgid 176
+#define __NR_getegid 177
+#define __NR_gettid 178
+#define __NR_sysinfo 179
+#define __NR_mq_open 180
+#define __NR_mq_unlink 181
+#define __NR_mq_notify 184
+#define __NR_mq_getsetattr 185
+#define __NR_msgget 186
+#define __NR_msgctl 187
+#define __NR_msgrcv 188
+#define __NR_msgsnd 189
+#define __NR_semget 190
+#define __NR_semctl 191
+#define __NR_semop 193
+#define __NR_shmget 194
+#define __NR_shmctl 195
+#define __NR_shmat 196
+#define __NR_shmdt 197
+#define __NR_socket 198
+#define __NR_socketpair 199
+#define __NR_bind 200
+#define __NR_listen 201
+#define __NR_accept 202
+#define __NR_connect 203
+#define __NR_getsockname 204
+#define __NR_getpeername 205
+#define __NR_sendto 206
+#define __NR_recvfrom 207
+#define __NR_setsockopt 208
+#define __NR_getsockopt 209
+#define __NR_shutdown 210
+#define __NR_sendmsg 211
+#define __NR_recvmsg 212
+#define __NR_readahead 213
+#define __NR_brk 214
+#define __NR_munmap 215
+#define __NR_mremap 216
+#define __NR_add_key 217
+#define __NR_request_key 218
+#define __NR_keyctl 219
+#define __NR_clone 220
+#define __NR_execve 221
+#define __NR_mmap2 222
+#define __NR_fadvise64_64 223
+#define __NR_swapon 224
+#define __NR_swapoff 225
+#define __NR_mprotect 226
+#define __NR_msync 227
+#define __NR_mlock 228
+#define __NR_munlock 229
+#define __NR_mlockall 230
+#define __NR_munlockall 231
+#define __NR_mincore 232
+#define __NR_madvise 233
+#define __NR_remap_file_pages 234
+#define __NR_mbind 235
+#define __NR_get_mempolicy 236
+#define __NR_set_mempolicy 237
+#define __NR_migrate_pages 238
+#define __NR_move_pages 239
+#define __NR_rt_tgsigqueueinfo 240
+#define __NR_perf_event_open 241
+#define __NR_accept4 242
+#define __NR_arch_specific_syscall 244
+#define __NR_prlimit64 261
+#define __NR_fanotify_init 262
+#define __NR_fanotify_mark 263
+#define __NR_name_to_handle_at 264
+#define __NR_open_by_handle_at 265
+#define __NR_syncfs 267
+#define __NR_setns 268
+#define __NR_sendmmsg 269
+#define __NR_process_vm_readv 270
+#define __NR_process_vm_writev 271
+#define __NR_kcmp 272
+#define __NR_finit_module 273
+#define __NR_sched_setattr 274
+#define __NR_sched_getattr 275
+#define __NR_renameat2 276
+#define __NR_seccomp 277
+#define __NR_getrandom 278
+#define __NR_memfd_create 279
+#define __NR_bpf 280
+#define __NR_execveat 281
+#define __NR_userfaultfd 282
+#define __NR_membarrier 283
+#define __NR_mlock2 284
+#define __NR_copy_file_range 285
+#define __NR_preadv2 286
+#define __NR_pwritev2 287
+#define __NR_pkey_mprotect 288
+#define __NR_pkey_alloc 289
+#define __NR_pkey_free 290
+#define __NR_statx 291
+#define __NR_rseq 293
+#define __NR_kexec_file_load 294
+#define __NR_clock_gettime64		403
+#define __NR_clock_settime64		404
+#define __NR_clock_adjtime64		405
+#define __NR_clock_getres_time64	406
+#define __NR_clock_nanosleep_time64	407
+#define __NR_timer_gettime64		408
+#define __NR_timer_settime64		409
+#define __NR_timerfd_gettime64		410
+#define __NR_timerfd_settime64		411
+#define __NR_utimensat_time64		412
+#define __NR_pselect6_time64		413
+#define __NR_ppoll_time64		414
+#define __NR_io_pgetevents_time64	416
+#define __NR_recvmmsg_time64		417
+#define __NR_mq_timedsend_time64	418
+#define __NR_mq_timedreceive_time64	419
+#define __NR_semtimedop_time64		420
+#define __NR_rt_sigtimedwait_time64	421
+#define __NR_futex_time64		422
+#define __NR_sched_rr_get_interval_time64 423
+#define __NR_pidfd_send_signal 424
+#define __NR_io_uring_setup 425
+#define __NR_io_uring_enter 426
+#define __NR_io_uring_register 427
+#define __NR_open_tree		428
+#define __NR_move_mount		429
+#define __NR_fsopen		430
+#define __NR_fsconfig		431
+#define __NR_fsmount		432
+#define __NR_fspick		433
+#define __NR_pidfd_open		434
+#define __NR_clone3		435
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
+#define __NR_futex __NR_futex_time64
+
+#define __NR_sysriscv __NR_arch_specific_syscall
+#define __NR_riscv_flush_icache (__NR_sysriscv + 15)
diff --git a/arch/riscv32/bits/user.h b/arch/riscv32/bits/user.h
new file mode 100644
index 00000000..0d37de0b
--- /dev/null
+++ b/arch/riscv32/bits/user.h
@@ -0,0 +1,6 @@
+#include <signal.h>
+
+#define ELF_NGREG 32
+#define ELF_NFPREG 33
+typedef unsigned long elf_greg_t, elf_gregset_t[ELF_NGREG];
+typedef union __riscv_mc_fp_state elf_fpregset_t;
diff --git a/arch/riscv32/crt_arch.h b/arch/riscv32/crt_arch.h
new file mode 100644
index 00000000..6b93fcfd
--- /dev/null
+++ b/arch/riscv32/crt_arch.h
@@ -0,0 +1,19 @@
+__asm__(
+".section .sdata,\"aw\"\n"
+".text\n"
+".global " START "\n"
+".type " START ",%function\n"
+START ":\n"
+".weak __global_pointer$\n"
+".hidden __global_pointer$\n"
+".option push\n"
+".option norelax\n\t"
+"lla gp, __global_pointer$\n"
+".option pop\n\t"
+"mv a0, sp\n"
+".weak _DYNAMIC\n"
+".hidden _DYNAMIC\n\t"
+"lla a1, _DYNAMIC\n\t"
+"andi sp, sp, -16\n\t"
+"tail " START "_c"
+);
diff --git a/arch/riscv32/kstat.h b/arch/riscv32/kstat.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/arch/riscv32/kstat.h
diff --git a/arch/riscv32/pthread_arch.h b/arch/riscv32/pthread_arch.h
new file mode 100644
index 00000000..a20d7fba
--- /dev/null
+++ b/arch/riscv32/pthread_arch.h
@@ -0,0 +1,13 @@
+static inline uintptr_t __get_tp()
+{
+	uintptr_t tp;
+	__asm__ __volatile__("mv %0, tp" : "=r"(tp));
+	return tp;
+}
+
+#define TLS_ABOVE_TP
+#define GAP_ABOVE_TP 0
+
+#define DTP_OFFSET 0x800
+
+#define MC_PC __gregs[0]
diff --git a/arch/riscv32/reloc.h b/arch/riscv32/reloc.h
new file mode 100644
index 00000000..59d15f17
--- /dev/null
+++ b/arch/riscv32/reloc.h
@@ -0,0 +1,22 @@
+#if defined __riscv_float_abi_soft
+#define RISCV_FP_SUFFIX "-sf"
+#elif defined __riscv_float_abi_single
+#define RISCV_FP_SUFFIX "-sp"
+#elif defined __riscv_float_abi_double
+#define RISCV_FP_SUFFIX ""
+#endif
+
+#define LDSO_ARCH "riscv32" RISCV_FP_SUFFIX
+
+#define TPOFF_K 0
+
+#define REL_SYMBOLIC    R_RISCV_32
+#define REL_PLT         R_RISCV_JUMP_SLOT
+#define REL_RELATIVE    R_RISCV_RELATIVE
+#define REL_COPY        R_RISCV_COPY
+#define REL_DTPMOD      R_RISCV_TLS_DTPMOD32
+#define REL_DTPOFF      R_RISCV_TLS_DTPREL32
+#define REL_TPOFF       R_RISCV_TLS_TPREL32
+
+#define CRTJMP(pc,sp) __asm__ __volatile__( \
+	"mv sp, %1 ; jr %0" : : "r"(pc), "r"(sp) : "memory" )
diff --git a/arch/riscv32/syscall_arch.h b/arch/riscv32/syscall_arch.h
new file mode 100644
index 00000000..70d2773f
--- /dev/null
+++ b/arch/riscv32/syscall_arch.h
@@ -0,0 +1,79 @@
+#define __SYSCALL_LL_E(x) \
+((union { long long ll; long l[2]; }){ .ll = x }).l[0], \
+((union { long long ll; long l[2]; }){ .ll = x }).l[1]
+#define __SYSCALL_LL_O(x) __SYSCALL_LL_E((x))
+
+#define __asm_syscall(...) \
+	__asm__ __volatile__ ("ecall\n\t" \
+	: "=r"(a0) : __VA_ARGS__ : "memory"); \
+	return a0; \
+
+static inline long __syscall0(long n)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0");
+	__asm_syscall("r"(a7))
+}
+
+static inline long __syscall1(long n, long a)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	__asm_syscall("r"(a7), "0"(a0))
+}
+
+static inline long __syscall2(long n, long a, long b)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	register long a1 __asm__("a1") = b;
+	__asm_syscall("r"(a7), "0"(a0), "r"(a1))
+}
+
+static inline long __syscall3(long n, long a, long b, long c)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	register long a1 __asm__("a1") = b;
+	register long a2 __asm__("a2") = c;
+	__asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2))
+}
+
+static inline long __syscall4(long n, long a, long b, long c, long d)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	register long a1 __asm__("a1") = b;
+	register long a2 __asm__("a2") = c;
+	register long a3 __asm__("a3") = d;
+	__asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3))
+}
+
+static inline long __syscall5(long n, long a, long b, long c, long d, long e)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	register long a1 __asm__("a1") = b;
+	register long a2 __asm__("a2") = c;
+	register long a3 __asm__("a3") = d;
+	register long a4 __asm__("a4") = e;
+	__asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4))
+}
+
+static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
+{
+	register long a7 __asm__("a7") = n;
+	register long a0 __asm__("a0") = a;
+	register long a1 __asm__("a1") = b;
+	register long a2 __asm__("a2") = c;
+	register long a3 __asm__("a3") = d;
+	register long a4 __asm__("a4") = e;
+	register long a5 __asm__("a5") = f;
+	__asm_syscall("r"(a7), "0"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5))
+}
+
+#define VDSO_USEFUL
+#define VDSO_CGT_SYM "__vdso_clock_gettime"
+#define VDSO_CGT_VER "LINUX_4.15"
+
+#define IPC_64 0
diff --git a/arch/riscv64/bits/fcntl.h b/arch/riscv64/bits/fcntl.h
deleted file mode 100644
index ecb4d18f..00000000
--- a/arch/riscv64/bits/fcntl.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#define O_CREAT        0100
-#define O_EXCL         0200
-#define O_NOCTTY       0400
-#define O_TRUNC       01000
-#define O_APPEND      02000
-#define O_NONBLOCK    04000
-#define O_DSYNC      010000
-#define O_SYNC     04010000
-#define O_RSYNC    04010000
-#define O_DIRECTORY 0200000
-#define O_NOFOLLOW  0400000
-#define O_CLOEXEC  02000000
-
-#define O_ASYNC      020000
-#define O_DIRECT     040000
-#define O_LARGEFILE 0100000
-#define O_NOATIME  01000000
-#define O_PATH    010000000
-#define O_TMPFILE 020200000
-#define O_NDELAY O_NONBLOCK
-
-#define F_DUPFD  0
-#define F_GETFD  1
-#define F_SETFD  2
-#define F_GETFL  3
-#define F_SETFL  4
-#define F_GETLK  5
-#define F_SETLK  6
-#define F_SETLKW 7
-#define F_SETOWN 8
-#define F_GETOWN 9
-#define F_SETSIG 10
-#define F_GETSIG 11
-
-#define F_SETOWN_EX 15
-#define F_GETOWN_EX 16
-
-#define F_GETOWNER_UIDS 17
diff --git a/arch/riscv64/bits/posix.h b/arch/riscv64/bits/posix.h
deleted file mode 100644
index 8068ce98..00000000
--- a/arch/riscv64/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFF64 1
-#define _POSIX_V7_LP64_OFF64 1
diff --git a/arch/riscv64/bits/reg.h b/arch/riscv64/bits/reg.h
deleted file mode 100644
index 2633f39d..00000000
--- a/arch/riscv64/bits/reg.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
diff --git a/arch/riscv64/bits/signal.h b/arch/riscv64/bits/signal.h
index b006334f..56f8fe17 100644
--- a/arch/riscv64/bits/signal.h
+++ b/arch/riscv64/bits/signal.h
@@ -19,7 +19,7 @@ struct __riscv_mc_d_ext_state {
 };
 
 struct __riscv_mc_q_ext_state {
-	unsigned long long __f[64] __attribute__((aligned(16)));
+	unsigned long long __f[64] __attribute__((__aligned__(16)));
 	unsigned int __fcsr;
 	unsigned int __reserved[3];
 };
@@ -41,7 +41,9 @@ typedef struct mcontext_t {
 #define REG_SP 2
 #define REG_TP 4
 #define REG_S0 8
+#define REG_S1 9
 #define REG_A0 10
+#define REG_S2 18
 #endif
 
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
@@ -60,10 +62,10 @@ struct sigaltstack {
 	size_t ss_size;
 };
 
-typedef struct ucontext_t
+typedef struct __ucontext
 {
 	unsigned long uc_flags;
-	struct ucontext_t *uc_link;
+	struct __ucontext *uc_link;
 	stack_t uc_stack;
 	sigset_t uc_sigmask;
 	mcontext_t uc_mcontext;
@@ -76,7 +78,6 @@ typedef struct ucontext_t
 #define SA_RESTART   0x10000000
 #define SA_NODEFER   0x40000000
 #define SA_RESETHAND 0x80000000
-#define SA_RESTORER  0x04000000
 
 #endif
 
diff --git a/arch/riscv64/bits/stdint.h b/arch/riscv64/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/riscv64/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/riscv64/bits/syscall.h.in b/arch/riscv64/bits/syscall.h.in
index 0043eeba..e362bd0e 100644
--- a/arch/riscv64/bits/syscall.h.in
+++ b/arch/riscv64/bits/syscall.h.in
@@ -76,7 +76,7 @@
 #define __NR_splice 76
 #define __NR_tee 77
 #define __NR_readlinkat 78
-#define __NR_fstatat 79
+#define __NR_newfstatat 79
 #define __NR_fstat 80
 #define __NR_sync 81
 #define __NR_fsync 82
@@ -289,6 +289,21 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
 #define __NR_sysriscv __NR_arch_specific_syscall
 #define __NR_riscv_flush_icache (__NR_sysriscv + 15)
diff --git a/arch/riscv64/bits/user.h b/arch/riscv64/bits/user.h
index 2da743ea..0d37de0b 100644
--- a/arch/riscv64/bits/user.h
+++ b/arch/riscv64/bits/user.h
@@ -1,5 +1,6 @@
 #include <signal.h>
 
 #define ELF_NGREG 32
+#define ELF_NFPREG 33
 typedef unsigned long elf_greg_t, elf_gregset_t[ELF_NGREG];
 typedef union __riscv_mc_fp_state elf_fpregset_t;
diff --git a/arch/riscv64/pthread_arch.h b/arch/riscv64/pthread_arch.h
index db414b17..a20d7fba 100644
--- a/arch/riscv64/pthread_arch.h
+++ b/arch/riscv64/pthread_arch.h
@@ -1,13 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *tp;
+	uintptr_t tp;
 	__asm__ __volatile__("mv %0, tp" : "=r"(tp));
-	return (void *)(tp - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)p + sizeof(struct pthread))
 
 #define DTP_OFFSET 0x800
 
diff --git a/arch/riscv64/reloc.h b/arch/riscv64/reloc.h
index 1ca13811..7c7c0611 100644
--- a/arch/riscv64/reloc.h
+++ b/arch/riscv64/reloc.h
@@ -17,6 +17,7 @@
 #define REL_DTPMOD      R_RISCV_TLS_DTPMOD64
 #define REL_DTPOFF      R_RISCV_TLS_DTPREL64
 #define REL_TPOFF       R_RISCV_TLS_TPREL64
+#define REL_TLSDESC     R_RISCV_TLSDESC
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
 	"mv sp, %1 ; jr %0" : : "r"(pc), "r"(sp) : "memory" )
diff --git a/arch/riscv64/syscall_arch.h b/arch/riscv64/syscall_arch.h
index 7fd042cd..81993fc8 100644
--- a/arch/riscv64/syscall_arch.h
+++ b/arch/riscv64/syscall_arch.h
@@ -71,8 +71,7 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 }
 
 #define VDSO_USEFUL
-/* We don't have a clock_gettime function.
 #define VDSO_CGT_SYM "__vdso_clock_gettime"
-#define VDSO_CGT_VER "LINUX_2.6" */
+#define VDSO_CGT_VER "LINUX_4.15"
 
 #define IPC_64 0
diff --git a/arch/s390x/bits/alltypes.h.in b/arch/s390x/bits/alltypes.h.in
index 15d18c8f..6c0eb7f4 100644
--- a/arch/s390x/bits/alltypes.h.in
+++ b/arch/s390x/bits/alltypes.h.in
@@ -9,7 +9,11 @@
 TYPEDEF int wchar_t;
 #endif
 
+#if defined(__FLT_EVAL_METHOD__) && __FLT_EVAL_METHOD__ == 1
 TYPEDEF double float_t;
+#else
+TYPEDEF float float_t;
+#endif
 TYPEDEF double double_t;
 
 TYPEDEF struct { long long __ll; long double __ld; } max_align_t;
diff --git a/arch/s390x/bits/float.h b/arch/s390x/bits/float.h
index 90b73bee..e188cb61 100644
--- a/arch/s390x/bits/float.h
+++ b/arch/s390x/bits/float.h
@@ -1,4 +1,8 @@
-#define FLT_EVAL_METHOD 1
+#ifdef __FLT_EVAL_METHOD__
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#else
+#define FLT_EVAL_METHOD 0
+#endif
 
 #define LDBL_TRUE_MIN 6.47517511943802511092443895822764655e-4966L
 #define LDBL_MIN 3.36210314311209350626267781732175260e-4932L
diff --git a/arch/s390x/bits/posix.h b/arch/s390x/bits/posix.h
deleted file mode 100644
index c37b94c1..00000000
--- a/arch/s390x/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFF64  1
-#define _POSIX_V7_LP64_OFF64  1
diff --git a/arch/s390x/bits/ptrace.h b/arch/s390x/bits/ptrace.h
index d50e3262..a06cb077 100644
--- a/arch/s390x/bits/ptrace.h
+++ b/arch/s390x/bits/ptrace.h
@@ -1,4 +1,7 @@
 #define PTRACE_SINGLEBLOCK		12
+#define PTRACE_OLDSETOPTIONS		21
+#define PTRACE_SYSEMU			31
+#define PTRACE_SYSEMU_SINGLESTEP	32
 #define PTRACE_PEEKUSR_AREA		0x5000
 #define PTRACE_POKEUSR_AREA		0x5001
 #define PTRACE_GET_LAST_BREAK		0x5006
diff --git a/arch/s390x/bits/reg.h b/arch/s390x/bits/reg.h
deleted file mode 100644
index 2633f39d..00000000
--- a/arch/s390x/bits/reg.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
diff --git a/arch/s390x/bits/stdint.h b/arch/s390x/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/s390x/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/s390x/bits/syscall.h.in b/arch/s390x/bits/syscall.h.in
index e89f3782..e60711a6 100644
--- a/arch/s390x/bits/syscall.h.in
+++ b/arch/s390x/bits/syscall.h.in
@@ -352,4 +352,20 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_memfd_secret	447
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/s390x/bits/user.h b/arch/s390x/bits/user.h
index ff3f0483..47f94f20 100644
--- a/arch/s390x/bits/user.h
+++ b/arch/s390x/bits/user.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-
 typedef union {
 	double d;
 	float f;
diff --git a/arch/s390x/pthread_arch.h b/arch/s390x/pthread_arch.h
index e2251f1f..e54fec3f 100644
--- a/arch/s390x/pthread_arch.h
+++ b/arch/s390x/pthread_arch.h
@@ -1,14 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
+	uintptr_t tp;
 	__asm__ (
 		"ear  %0, %%a0\n"
 		"sllg %0, %0, 32\n"
 		"ear  %0, %%a1\n"
-		: "=r"(self));
-	return self;
+		: "=r"(tp));
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC psw.addr
diff --git a/arch/s390x/reloc.h b/arch/s390x/reloc.h
index 6e5c1fb8..38de9d9b 100644
--- a/arch/s390x/reloc.h
+++ b/arch/s390x/reloc.h
@@ -10,4 +10,4 @@
 #define REL_TPOFF       R_390_TLS_TPOFF
 
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
-	"lgr %%r15,%1; br %0" : : "r"(pc), "r"(sp) : "memory" )
+	"lgr %%r15,%1; br %0" : : "a"(pc), "r"(sp) : "memory" )
diff --git a/arch/s390x/syscall_arch.h b/arch/s390x/syscall_arch.h
index afb99852..d1cfd2e8 100644
--- a/arch/s390x/syscall_arch.h
+++ b/arch/s390x/syscall_arch.h
@@ -73,4 +73,6 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	__asm_syscall("+r"(r2), "r"(r1), "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7));
 }
 
-#define SYSCALL_USE_SOCKETCALL
+#define VDSO_USEFUL
+#define VDSO_CGT_SYM "__kernel_clock_gettime"
+#define VDSO_CGT_VER "LINUX_2.6.29"
diff --git a/arch/sh/bits/posix.h b/arch/sh/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/sh/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/sh/bits/signal.h b/arch/sh/bits/signal.h
index 160311fa..d0b14828 100644
--- a/arch/sh/bits/signal.h
+++ b/arch/sh/bits/signal.h
@@ -9,7 +9,16 @@
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 typedef int greg_t, gregset_t[16];
 typedef int freg_t, fpregset_t[16];
-typedef struct sigcontext {
+typedef struct {
+	unsigned long oldmask;
+	unsigned long gregs[16];
+	unsigned long pc, pr, sr;
+	unsigned long gbr, mach, macl;
+	unsigned long fpregs[16];
+	unsigned long xfpregs[16];
+	unsigned int fpscr, fpul, ownedfp;
+} mcontext_t;
+struct sigcontext {
 	unsigned long oldmask;
 	unsigned long sc_regs[16];
 	unsigned long sc_pc, sc_pr, sc_sr;
@@ -17,7 +26,7 @@ typedef struct sigcontext {
 	unsigned long sc_fpregs[16];
 	unsigned long sc_xfpregs[16];
 	unsigned int sc_fpscr, sc_fpul, sc_ownedfp;
-} mcontext_t;
+};
 #else
 typedef struct {
 	unsigned long __regs[58];
diff --git a/arch/sh/bits/stdint.h b/arch/sh/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/sh/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/sh/bits/syscall.h.in b/arch/sh/bits/syscall.h.in
index 0102ddaf..915a79cd 100644
--- a/arch/sh/bits/syscall.h.in
+++ b/arch/sh/bits/syscall.h.in
@@ -398,4 +398,20 @@
 #define __NR_fsmount		432
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
+#define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/sh/bits/user.h b/arch/sh/bits/user.h
index 07fe843b..b6ba16ed 100644
--- a/arch/sh/bits/user.h
+++ b/arch/sh/bits/user.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
-
 #define REG_REG0	 0
 #define REG_REG15	15
 #define REG_PC		16
diff --git a/arch/sh/pthread_arch.h b/arch/sh/pthread_arch.h
index 3ee9c1a9..199c2d55 100644
--- a/arch/sh/pthread_arch.h
+++ b/arch/sh/pthread_arch.h
@@ -1,17 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *self;
-	__asm__ ("stc gbr,%0" : "=r" (self) );
-	return (struct pthread *) (self - sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ("stc gbr,%0" : "=r" (tp) );
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 8
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
-#define MC_PC sc_pc
+#define MC_PC pc
 
 #ifdef __FDPIC__
-#define MC_GOT sc_regs[12]
+#define MC_GOT gregs[12]
 #define CANCEL_GOT (*(uintptr_t *)((char *)__syscall_cp_asm+sizeof(uintptr_t)))
 #endif
diff --git a/arch/x32/bits/fcntl.h b/arch/x32/bits/fcntl.h
index 1b88ad39..08627f81 100644
--- a/arch/x32/bits/fcntl.h
+++ b/arch/x32/bits/fcntl.h
@@ -13,7 +13,7 @@
 
 #define O_ASYNC      020000
 #define O_DIRECT     040000
-#define O_LARGEFILE       0
+#define O_LARGEFILE 0100000
 #define O_NOATIME  01000000
 #define O_PATH    010000000
 #define O_TMPFILE 020200000
diff --git a/arch/x32/bits/posix.h b/arch/x32/bits/posix.h
deleted file mode 100644
index 30a38714..00000000
--- a/arch/x32/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/x32/bits/reg.h b/arch/x32/bits/reg.h
index 5faaef1a..6e54abcf 100644
--- a/arch/x32/bits/reg.h
+++ b/arch/x32/bits/reg.h
@@ -1,5 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 32
 #define R15    0
 #define R14    1
 #define R13    2
diff --git a/arch/x32/bits/stdint.h b/arch/x32/bits/stdint.h
deleted file mode 100644
index d1b27121..00000000
--- a/arch/x32/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT32_MIN
-#define INTPTR_MAX      INT32_MAX
-#define UINTPTR_MAX     UINT32_MAX
-#define PTRDIFF_MIN     INT32_MIN
-#define PTRDIFF_MAX     INT32_MAX
-#define SIZE_MAX        UINT32_MAX
diff --git a/arch/x32/bits/syscall.h.in b/arch/x32/bits/syscall.h.in
index f47bdee5..1d065eea 100644
--- a/arch/x32/bits/syscall.h.in
+++ b/arch/x32/bits/syscall.h.in
@@ -298,6 +298,22 @@
 #define __NR_fspick		(0x40000000 + 433)
 #define __NR_pidfd_open		(0x40000000 + 434)
 #define __NR_clone3		(0x40000000 + 435)
+#define __NR_close_range	(0x40000000 + 436)
+#define __NR_openat2		(0x40000000 + 437)
+#define __NR_pidfd_getfd	(0x40000000 + 438)
+#define __NR_faccessat2		(0x40000000 + 439)
+#define __NR_process_madvise	(0x40000000 + 440)
+#define __NR_epoll_pwait2	(0x40000000 + 441)
+#define __NR_mount_setattr	(0x40000000 + 442)
+#define __NR_landlock_create_ruleset	(0x40000000 + 444)
+#define __NR_landlock_add_rule	(0x40000000 + 445)
+#define __NR_landlock_restrict_self	(0x40000000 + 446)
+#define __NR_memfd_secret	(0x40000000 + 447)
+#define __NR_process_mrelease	(0x40000000 + 448)
+#define __NR_futex_waitv	(0x40000000 + 449)
+#define __NR_set_mempolicy_home_node	(0x40000000 + 450)
+#define __NR_cachestat		(0x40000000 + 451)
+#define __NR_fchmodat2		(0x40000000 + 452)
 
 
 #define __NR_rt_sigaction (0x40000000 + 512)
diff --git a/arch/x32/bits/user.h b/arch/x32/bits/user.h
index 4073cc06..b328edf9 100644
--- a/arch/x32/bits/user.h
+++ b/arch/x32/bits/user.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-
 typedef struct user_fpregs_struct {
 	uint16_t cwd, swd, ftw, fop;
 	uint64_t rip, rdp;
diff --git a/arch/x32/crt_arch.h b/arch/x32/crt_arch.h
index 3eec61bd..b1c9c476 100644
--- a/arch/x32/crt_arch.h
+++ b/arch/x32/crt_arch.h
@@ -1,6 +1,7 @@
 __asm__(
 ".text \n"
 ".global " START " \n"
+".type " START ",%function \n"
 START ": \n"
 "	xor %rbp,%rbp \n"
 "	mov %rsp,%rdi \n"
diff --git a/arch/x32/pthread_arch.h b/arch/x32/pthread_arch.h
index f640a1a1..c1e7716d 100644
--- a/arch/x32/pthread_arch.h
+++ b/arch/x32/pthread_arch.h
@@ -1,14 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("mov %%fs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_RIP]
 
-#define CANARY canary2
+#define CANARY_PAD
 
 #define tls_mod_off_t unsigned long long
diff --git a/arch/x86_64/bits/fcntl.h b/arch/x86_64/bits/fcntl.h
deleted file mode 100644
index 1b88ad39..00000000
--- a/arch/x86_64/bits/fcntl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#define O_CREAT        0100
-#define O_EXCL         0200
-#define O_NOCTTY       0400
-#define O_TRUNC       01000
-#define O_APPEND      02000
-#define O_NONBLOCK    04000
-#define O_DSYNC      010000
-#define O_SYNC     04010000
-#define O_RSYNC    04010000
-#define O_DIRECTORY 0200000
-#define O_NOFOLLOW  0400000
-#define O_CLOEXEC  02000000
-
-#define O_ASYNC      020000
-#define O_DIRECT     040000
-#define O_LARGEFILE       0
-#define O_NOATIME  01000000
-#define O_PATH    010000000
-#define O_TMPFILE 020200000
-#define O_NDELAY O_NONBLOCK
-
-#define F_DUPFD  0
-#define F_GETFD  1
-#define F_SETFD  2
-#define F_GETFL  3
-#define F_SETFL  4
-
-#define F_SETOWN 8
-#define F_GETOWN 9
-#define F_SETSIG 10
-#define F_GETSIG 11
-
-#define F_GETLK 5
-#define F_SETLK 6
-#define F_SETLKW 7
-
-#define F_SETOWN_EX 15
-#define F_GETOWN_EX 16
-
-#define F_GETOWNER_UIDS 17
diff --git a/arch/x86_64/bits/posix.h b/arch/x86_64/bits/posix.h
deleted file mode 100644
index c37b94c1..00000000
--- a/arch/x86_64/bits/posix.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define _POSIX_V6_LP64_OFF64  1
-#define _POSIX_V7_LP64_OFF64  1
diff --git a/arch/x86_64/bits/reg.h b/arch/x86_64/bits/reg.h
index a4df04ce..6e54abcf 100644
--- a/arch/x86_64/bits/reg.h
+++ b/arch/x86_64/bits/reg.h
@@ -1,5 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
 #define R15    0
 #define R14    1
 #define R13    2
diff --git a/arch/x86_64/bits/stdint.h b/arch/x86_64/bits/stdint.h
deleted file mode 100644
index 1bb147f2..00000000
--- a/arch/x86_64/bits/stdint.h
+++ /dev/null
@@ -1,20 +0,0 @@
-typedef int32_t int_fast16_t;
-typedef int32_t int_fast32_t;
-typedef uint32_t uint_fast16_t;
-typedef uint32_t uint_fast32_t;
-
-#define INT_FAST16_MIN  INT32_MIN
-#define INT_FAST32_MIN  INT32_MIN
-
-#define INT_FAST16_MAX  INT32_MAX
-#define INT_FAST32_MAX  INT32_MAX
-
-#define UINT_FAST16_MAX UINT32_MAX
-#define UINT_FAST32_MAX UINT32_MAX
-
-#define INTPTR_MIN      INT64_MIN
-#define INTPTR_MAX      INT64_MAX
-#define UINTPTR_MAX     UINT64_MAX
-#define PTRDIFF_MIN     INT64_MIN
-#define PTRDIFF_MAX     INT64_MAX
-#define SIZE_MAX        UINT64_MAX
diff --git a/arch/x86_64/bits/syscall.h.in b/arch/x86_64/bits/syscall.h.in
index 6a646ad3..6543bbba 100644
--- a/arch/x86_64/bits/syscall.h.in
+++ b/arch/x86_64/bits/syscall.h.in
@@ -345,4 +345,20 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
+#define __NR_process_madvise	440
+#define __NR_epoll_pwait2	441
+#define __NR_mount_setattr	442
+#define __NR_landlock_create_ruleset	444
+#define __NR_landlock_add_rule	445
+#define __NR_landlock_restrict_self	446
+#define __NR_memfd_secret	447
+#define __NR_process_mrelease	448
+#define __NR_futex_waitv	449
+#define __NR_set_mempolicy_home_node	450
+#define __NR_cachestat		451
+#define __NR_fchmodat2		452
 
diff --git a/arch/x86_64/bits/user.h b/arch/x86_64/bits/user.h
index 4073cc06..b328edf9 100644
--- a/arch/x86_64/bits/user.h
+++ b/arch/x86_64/bits/user.h
@@ -1,6 +1,3 @@
-#undef __WORDSIZE
-#define __WORDSIZE 64
-
 typedef struct user_fpregs_struct {
 	uint16_t cwd, swd, ftw, fop;
 	uint64_t rip, rdp;
diff --git a/arch/x86_64/crt_arch.h b/arch/x86_64/crt_arch.h
index 3eec61bd..b1c9c476 100644
--- a/arch/x86_64/crt_arch.h
+++ b/arch/x86_64/crt_arch.h
@@ -1,6 +1,7 @@
 __asm__(
 ".text \n"
 ".global " START " \n"
+".type " START ",%function \n"
 START ": \n"
 "	xor %rbp,%rbp \n"
 "	mov %rsp,%rdi \n"
diff --git a/arch/x86_64/pthread_arch.h b/arch/x86_64/pthread_arch.h
index 65e880c6..c8c63f2e 100644
--- a/arch/x86_64/pthread_arch.h
+++ b/arch/x86_64/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("mov %%fs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_RIP]
diff --git a/compat/time32/__xstat.c b/compat/time32/__xstat.c
index acfbd3cc..e52b5deb 100644
--- a/compat/time32/__xstat.c
+++ b/compat/time32/__xstat.c
@@ -3,22 +3,22 @@
 
 struct stat32;
 
-int __fxstat64(int ver, int fd, struct stat32 *buf)
+int __fxstat(int ver, int fd, struct stat32 *buf)
 {
 	return __fstat_time32(fd, buf);
 }
 
-int __fxstatat64(int ver, int fd, const char *path, struct stat32 *buf, int flag)
+int __fxstatat(int ver, int fd, const char *path, struct stat32 *buf, int flag)
 {
 	return __fstatat_time32(fd, path, buf, flag);
 }
 
-int __lxstat64(int ver, const char *path, struct stat32 *buf)
+int __lxstat(int ver, const char *path, struct stat32 *buf)
 {
 	return __lstat_time32(path, buf);
 }
 
-int __xstat64(int ver, const char *path, struct stat32 *buf)
+int __xstat(int ver, const char *path, struct stat32 *buf)
 {
 	return __stat_time32(path, buf);
 }
diff --git a/compat/time32/aio_suspend_time32.c b/compat/time32/aio_suspend_time32.c
index ed5119bd..d99cb651 100644
--- a/compat/time32/aio_suspend_time32.c
+++ b/compat/time32/aio_suspend_time32.c
@@ -7,5 +7,3 @@ int __aio_suspend_time32(const struct aiocb *const cbs[], int cnt, const struct
 	return aio_suspend(cbs, cnt, ts32 ? (&(struct timespec){
 		.tv_sec = ts32->tv_sec, .tv_nsec = ts32->tv_nsec}) : 0);
 }
-
-weak_alias(aio_suspend, aio_suspend64);
diff --git a/compat/time32/fstat_time32.c b/compat/time32/fstat_time32.c
index 3e084398..e5d52022 100644
--- a/compat/time32/fstat_time32.c
+++ b/compat/time32/fstat_time32.c
@@ -13,5 +13,3 @@ int __fstat_time32(int fd, struct stat32 *restrict st32)
 	if (!r) memcpy(st32, &st, offsetof(struct stat, st_atim));
 	return r;
 }
-
-weak_alias(fstat, fstat64);
diff --git a/compat/time32/fstatat_time32.c b/compat/time32/fstatat_time32.c
index 85dcb008..31d42e63 100644
--- a/compat/time32/fstatat_time32.c
+++ b/compat/time32/fstatat_time32.c
@@ -13,5 +13,3 @@ int __fstatat_time32(int fd, const char *restrict path, struct stat32 *restrict
 	if (!r) memcpy(st32, &st, offsetof(struct stat, st_atim));
 	return r;
 }
-
-weak_alias(fstatat, fstatat64);
diff --git a/compat/time32/lstat_time32.c b/compat/time32/lstat_time32.c
index c1257a14..28cb5a0b 100644
--- a/compat/time32/lstat_time32.c
+++ b/compat/time32/lstat_time32.c
@@ -13,5 +13,3 @@ int __lstat_time32(const char *restrict path, struct stat32 *restrict st32)
 	if (!r) memcpy(st32, &st, offsetof(struct stat, st_atim));
 	return r;
 }
-
-weak_alias(lstat, lstat64);
diff --git a/compat/time32/stat_time32.c b/compat/time32/stat_time32.c
index 8c6121da..b154b0f9 100644
--- a/compat/time32/stat_time32.c
+++ b/compat/time32/stat_time32.c
@@ -13,5 +13,3 @@ int __stat_time32(const char *restrict path, struct stat32 *restrict st32)
 	if (!r) memcpy(st32, &st, offsetof(struct stat, st_atim));
 	return r;
 }
-
-weak_alias(stat, stat64);
diff --git a/configure b/configure
index a2728969..bc9fbe48 100755
--- a/configure
+++ b/configure
@@ -30,11 +30,14 @@ System types:
 Optional features:
   --enable-optimize=...   optimize listed components for speed over size [auto]
   --enable-debug          build with debugging information [disabled]
-  --enable-warnings       build with recommended warnings flags [disabled]
+  --disable-warnings      build with recommended warnings flags [enabled]
   --enable-wrapper=...    build given musl toolchain wrapper [auto]
   --disable-shared        inhibit building shared library [enabled]
   --disable-static        inhibit building static library [enabled]
 
+Optional packages:
+  --with-malloc=...       choose malloc implementation [mallocng]
+
 Some influential environment variables:
   CC                      C compiler command [detected]
   CFLAGS                  C compiler flags [-Os -pipe ...]
@@ -133,12 +136,13 @@ build=
 target=
 optimize=auto
 debug=no
-warnings=no
+warnings=yes
 shared=auto
 static=yes
 wrapper=auto
 gcc_wrapper=no
 clang_wrapper=no
+malloc_dir=mallocng
 
 for arg ; do
 case "$arg" in
@@ -168,6 +172,7 @@ case "$arg" in
 --disable-wrapper|--enable-wrapper=no) wrapper=no ;;
 --enable-gcc-wrapper|--enable-gcc-wrapper=yes) wrapper=yes ; gcc_wrapper=yes ;;
 --disable-gcc-wrapper|--enable-gcc-wrapper=no) wrapper=no ;;
+--with-malloc=*) malloc_dir=${arg#*=} ;;
 --enable-*|--disable-*|--with-*|--without-*|--*dir=*) ;;
 --host=*|--target=*) target=${arg#*=} ;;
 --build=*) build=${arg#*=} ;;
@@ -199,7 +204,7 @@ fi
 abs_builddir="$(pwd)" || fail "$0: cannot determine working directory"
 abs_srcdir="$(cd $srcdir && pwd)" || fail "$0: invalid source directory $srcdir"
 test "$abs_srcdir" = "$abs_builddir" && srcdir=.
-test "$srcdir" != "." -a -f Makefile -a ! -h Makefile && fail "$0: Makefile already exists in the working directory"
+test "$srcdir" != "." && test -f Makefile && test ! -h Makefile && fail "$0: Makefile already exists in the working directory"
 
 #
 # Get a temp filename we can use
@@ -215,6 +220,12 @@ set +C
 trap 'rm "$tmpc"' EXIT INT QUIT TERM HUP
 
 #
+# Check that the requested malloc implementation exists
+#
+test -d "$srcdir/src/malloc/$malloc_dir" \
+|| fail "$0: error: chosen malloc implementation '$malloc_dir' does not exist"
+
+#
 # Check whether we are cross-compiling, and set a default
 # CROSS_COMPILE prefix if none was provided.
 #
@@ -268,7 +279,7 @@ echo "$cc_family"
 #
 # Figure out toolchain wrapper to build
 #
-if test "$wrapper" = auto -o "$wrapper" = detect ; then
+if test "$wrapper" = auto || test "$wrapper" = detect ; then
 echo "#include <stdlib.h>" > "$tmpc"
 echo "#if ! __GLIBC__" >> "$tmpc"
 echo "#error no" >> "$tmpc"
@@ -317,6 +328,7 @@ i?86*) ARCH=i386 ;;
 x86_64-x32*|x32*|x86_64*x32) ARCH=x32 ;;
 x86_64-nt64*) ARCH=nt64 ;;
 x86_64*) ARCH=x86_64 ;;
+loongarch64*) ARCH=loongarch64 ;;
 m68k*) ARCH=m68k ;;
 mips64*|mipsisa64*) ARCH=mips64 ;;
 mips*) ARCH=mips ;;
@@ -325,6 +337,7 @@ or1k*) ARCH=or1k ;;
 powerpc64*|ppc64*) ARCH=powerpc64 ;;
 powerpc*|ppc*) ARCH=powerpc ;;
 riscv64*) ARCH=riscv64 ;;
+riscv32*) ARCH=riscv32 ;;
 sh[1-9bel-]*|sh|superh*) ARCH=sh ;;
 s390x*) ARCH=s390x ;;
 unknown) fail "$0: unable to detect target arch; try $0 --target=..." ;;
@@ -343,6 +356,14 @@ tryflag CFLAGS_C99FSE -fexcess-precision=standard \
 tryflag CFLAGS_C99FSE -frounding-math
 
 #
+# Semantically we want to insist that our sources follow the
+# C rules for type-based aliasing, but most if not all real-world
+# compilers are known or suspected to have critical bugs in their
+# type-based aliasing analysis. See for example GCC bug 107107.
+#
+tryflag CFLAGS_C99FSE -fno-strict-aliasing
+
+#
 # We may use the may_alias attribute if __GNUC__ is defined, so
 # if the compiler defines __GNUC__ but does not provide it,
 # it must be defined away as part of the CFLAGS.
@@ -398,7 +419,7 @@ test "$debug" = yes && CFLAGS_AUTO=-g
 #
 printf "checking whether we should preprocess assembly to add debugging information... "
 if fnmatch '-g*|*\ -g*' "$CFLAGS_AUTO $CFLAGS" &&
-   test -f "tools/add-cfi.$ARCH.awk" &&
+   test -f "$srcdir/tools/add-cfi.$ARCH.awk" &&
    printf ".file 1 \"srcfile.s\"\n.line 1\n.cfi_startproc\n.cfi_endproc" | $CC -g -x assembler -c -o /dev/null 2>/dev/null -
 then
   ADD_CFI=yes
@@ -425,7 +446,20 @@ xno|x) printf "disabled\n" ; optimize=no ;;
 *) printf "custom\n" ;;
 esac
 
-test "$optimize" = no || tryflag CFLAGS_AUTO -Os || tryflag CFLAGS_AUTO -O2
+if test "$optimize" = no ; then :
+else
+tryflag CFLAGS_AUTO -O2
+tryflag CFLAGS_AUTO -fno-align-jumps
+tryflag CFLAGS_AUTO -fno-align-functions
+tryflag CFLAGS_AUTO -fno-align-loops
+tryflag CFLAGS_AUTO -fno-align-labels
+tryflag CFLAGS_AUTO -fira-region=one
+tryflag CFLAGS_AUTO -fira-hoist-pressure
+tryflag CFLAGS_AUTO -freorder-blocks-algorithm=simple \
+|| tryflag CFLAGS_AUTO -fno-reorder-blocks
+tryflag CFLAGS_AUTO -fno-prefetch-loop-arrays
+tryflag CFLAGS_AUTO -fno-tree-ch
+fi
 test "$optimize" = yes && optimize="internal,malloc,string"
 
 if fnmatch 'no|size' "$optimize" ; then :
@@ -457,7 +491,7 @@ tryflag CFLAGS_AUTO -pipe
 # pointer is no longer needed for debugging.
 #
 if fnmatch '-g*|*\ -g*' "$CFLAGS_AUTO $CFLAGS" ; then :
-else 
+else
 tryflag CFLAGS_AUTO -fomit-frame-pointer
 fi
 
@@ -495,6 +529,16 @@ fnmatch '-mtune=*|*\ -mtune=*' "$CC $CFLAGS" || tryldflag CFLAGS_AUTO -mtune=gen
 fi
 
 #
+# GCC defines -w as overriding any -W options, regardless of order, but
+# clang has a bunch of annoying warnings enabled by default and needs -w
+# to start from a clean slate. So use -w if building with clang. Also
+# turn off a common on-by-default cast warning regardless of compiler.
+#
+test "$cc_family" = clang && tryflag CFLAGS_AUTO -w
+
+tryflag CFLAGS_AUTO -Wno-pointer-to-int-cast
+
+#
 # Even with -std=c99, gcc accepts some constructs which are constraint
 # violations. We want to treat these as errors regardless of whether
 # other purely stylistic warnings are enabled -- especially implicit
@@ -504,6 +548,10 @@ tryflag CFLAGS_AUTO -Werror=implicit-function-declaration
 tryflag CFLAGS_AUTO -Werror=implicit-int
 tryflag CFLAGS_AUTO -Werror=pointer-sign
 tryflag CFLAGS_AUTO -Werror=pointer-arith
+tryflag CFLAGS_AUTO -Werror=int-conversion
+tryflag CFLAGS_AUTO -Werror=incompatible-pointer-types
+tryflag CFLAGS_AUTO -Werror=discarded-qualifiers
+tryflag CFLAGS_AUTO -Werror=discarded-array-qualifiers
 
 #
 # GCC ignores unused arguements by default, but Clang needs this extra
@@ -513,14 +561,17 @@ tryflag CFLAGS_AUTO -Werror=pointer-arith
 test "$cc_family" = clang && tryflag CFLAGS_AUTO -Qunused-arguments
 
 if test "x$warnings" = xyes ; then
-tryflag CFLAGS_AUTO -Wall
-tryflag CFLAGS_AUTO -Wno-parentheses
-tryflag CFLAGS_AUTO -Wno-uninitialized
-tryflag CFLAGS_AUTO -Wno-missing-braces
-tryflag CFLAGS_AUTO -Wno-unused-value
-tryflag CFLAGS_AUTO -Wno-unused-but-set-variable
-tryflag CFLAGS_AUTO -Wno-unknown-pragmas
-tryflag CFLAGS_AUTO -Wno-pointer-to-int-cast
+tryflag CFLAGS_AUTO -Waddress
+tryflag CFLAGS_AUTO -Warray-bounds
+tryflag CFLAGS_AUTO -Wchar-subscripts
+tryflag CFLAGS_AUTO -Wduplicate-decl-specifier
+tryflag CFLAGS_AUTO -Winit-self
+tryflag CFLAGS_AUTO -Wreturn-type
+tryflag CFLAGS_AUTO -Wsequence-point
+tryflag CFLAGS_AUTO -Wstrict-aliasing
+tryflag CFLAGS_AUTO -Wunused-function
+tryflag CFLAGS_AUTO -Wunused-label
+tryflag CFLAGS_AUTO -Wunused-variable
 fi
 
 # Determine if the compiler produces position-independent code (PIC)
@@ -622,6 +673,19 @@ if test "$ARCH" = "aarch64" ; then
 trycppif __AARCH64EB__ "$t" && SUBARCH=${SUBARCH}_be
 fi
 
+if test "$ARCH" = "loongarch64" ; then
+trycppif __loongarch_soft_float "$t" && SUBARCH=${SUBARCH}-sf
+trycppif __loongarch_single_float "$t" && SUBARCH=${SUBARCH}-sp
+printf "checking whether assembler support FCSRs... "
+echo "__asm__(\"movfcsr2gr \$t0,\$fcsr0\");" > "$tmpc"
+if $CC -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
+printf "yes\n"
+else
+printf "no\n"
+CFLAGS_AUTO="$CFLAGS_AUTO -DBROKEN_LOONGARCH_FCSR_ASM"
+fi
+fi
+
 if test "$ARCH" = "m68k" ; then
 if trycppif "__HAVE_68881__" ; then : ;
 elif trycppif "__mcffpu__" ; then SUBARCH="-fp64"
@@ -643,9 +707,7 @@ trycppif __mips_soft_float "$t" && SUBARCH=${SUBARCH}-sf
 fi
 
 if test "$ARCH" = "powerpc" ; then
-trycppif "__NO_FPRS__ && !_SOFT_FLOAT" "$t" && fail \
-  "$0: error: compiler's floating point configuration is unsupported"
-trycppif _SOFT_FLOAT "$t" && SUBARCH=${SUBARCH}-sf
+trycppif "_SOFT_FLOAT || __NO_FPRS__" "$t" && SUBARCH=${SUBARCH}-sf
 printf "checking whether compiler can use 'd' constraint in asm... "
 echo 'double f(double x) { __asm__ ("fabs %0, %1" : "=d"(x) : "d"(x)); return x; }' > "$tmpc"
 if $CC $CFLAGS_C99FSE $CPPFLAGS $CFLAGS -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
@@ -666,7 +728,7 @@ trycppif __LITTLE_ENDIAN__ "$t" && SUBARCH=${SUBARCH}le
 trycppif _SOFT_FLOAT "$t" && fail "$0: error: soft-float not supported on powerpc64"
 fi
 
-if test "$ARCH" = "riscv64" ; then
+if test "$ARCH" = "riscv64" -o "$ARCH" = "riscv32" ; then
 trycppif __riscv_float_abi_soft "$t" && SUBARCH=${SUBARCH}-sf
 trycppif __riscv_float_abi_single "$t" && SUBARCH=${SUBARCH}-sp
 fi
@@ -697,11 +759,6 @@ fi
 test "$SUBARCH" \
 && printf "configured for %s variant: %s\n" "$ARCH" "$ARCH$SUBARCH"
 
-case "$ARCH$SUBARCH" in
-arm) ASMSUBARCH=el ;;
-*) ASMSUBARCH=$SUBARCH ;;
-esac
-
 #
 # Some archs (powerpc) have different possible long double formats
 # that the compiler can be configured for. The logic for whether this
@@ -772,6 +829,7 @@ OPTIMIZE_GLOBS = $OPTIMIZE_GLOBS
 ALL_TOOLS = $tools
 TOOL_LIBS = $tool_libs
 ADD_CFI = $ADD_CFI
+MALLOC_DIR = $malloc_dir
 EOF
 test "x$static" = xno && echo "STATIC_LIBS ="
 test "x$shared" = xno && echo "SHARED_LIBS ="
diff --git a/crt/aarch64/crti.s b/crt/aarch64/crti.s
index 775df0ac..3776fa64 100644
--- a/crt/aarch64/crti.s
+++ b/crt/aarch64/crti.s
@@ -1,6 +1,7 @@
 .section .init
 .global _init
 .type _init,%function
+.align 2
 _init:
 	stp x29,x30,[sp,-16]!
 	mov x29,sp
@@ -8,6 +9,7 @@ _init:
 .section .fini
 .global _fini
 .type _fini,%function
+.align 2
 _fini:
 	stp x29,x30,[sp,-16]!
 	mov x29,sp
diff --git a/crt/arm/crti.s b/crt/arm/crti.s
index 18dc1e41..cccda3ea 100644
--- a/crt/arm/crti.s
+++ b/crt/arm/crti.s
@@ -3,11 +3,13 @@
 .section .init
 .global _init
 .type _init,%function
+.align 2
 _init:
 	push {r0,lr}
 
 .section .fini
 .global _fini
 .type _fini,%function
+.align 2
 _fini:
 	push {r0,lr}
diff --git a/crt/crt1.c b/crt/crt1.c
index 8fe8ab5d..10601215 100644
--- a/crt/crt1.c
+++ b/crt/crt1.c
@@ -11,7 +11,7 @@ weak void _fini();
 int __libc_start_main(int (*)(), int, char **,
 	void (*)(), void(*)(), void(*)());
 
-void _start_c(long *p)
+hidden void _start_c(long *p)
 {
 	int argc = p[0];
 	char **argv = (void *)(p+1);
diff --git a/crt/mips/crtn.s b/crt/mips/crtn.s
index 506a04b7..92eb3d0e 100644
--- a/crt/mips/crtn.s
+++ b/crt/mips/crtn.s
@@ -3,11 +3,11 @@
 .section .init
 	lw $gp,24($sp)
 	lw $ra,28($sp)
-	j $ra
+	jr $ra
 	addu $sp,$sp,32
 
 .section .fini
 	lw $gp,24($sp)
 	lw $ra,28($sp)
-	j $ra
+	jr $ra
 	addu $sp,$sp,32
diff --git a/crt/mips64/crtn.s b/crt/mips64/crtn.s
index f3930b24..8f090ed3 100644
--- a/crt/mips64/crtn.s
+++ b/crt/mips64/crtn.s
@@ -3,11 +3,11 @@
 .section .init
 	ld $gp,16($sp)
 	ld $ra,24($sp)
-	j $ra
+	jr $ra
 	daddu $sp,$sp,32
 
 .section .fini
 	ld $gp,16($sp)
 	ld $ra,24($sp)
-	j $ra
+	jr $ra
 	daddu $sp,$sp,32
diff --git a/crt/mipsn32/crtn.s b/crt/mipsn32/crtn.s
index dccd7e89..0679eac3 100644
--- a/crt/mipsn32/crtn.s
+++ b/crt/mipsn32/crtn.s
@@ -2,11 +2,11 @@
 .section	.init
 	ld	$gp, 16($sp)
 	ld	$ra, 24($sp)
-	j	$ra
+	jr	$ra
 	addu	$sp, $sp, 32
 
 .section	.fini
 	ld	$gp, 16($sp)
 	ld	$ra, 24($sp)
-	j	$ra
+	jr	$ra
 	addu	$sp, $sp, 32
diff --git a/include/aio.h b/include/aio.h
index 453c41b7..a938fcad 100644
--- a/include/aio.h
+++ b/include/aio.h
@@ -49,7 +49,7 @@ int aio_fsync(int, struct aiocb *);
 
 int lio_listio(int, struct aiocb *__restrict const *__restrict, int, struct sigevent *__restrict);
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define aiocb64 aiocb
 #define aio_read64 aio_read
 #define aio_write64 aio_write
diff --git a/include/alltypes.h.in b/include/alltypes.h.in
index d9ff462e..d47aeea9 100644
--- a/include/alltypes.h.in
+++ b/include/alltypes.h.in
@@ -77,6 +77,8 @@ TYPEDEF struct __sigset_t { unsigned long __bits[128/sizeof(long)]; } sigset_t;
 
 STRUCT iovec { void *iov_base; size_t iov_len; };
 
+STRUCT winsize { unsigned short ws_row, ws_col, ws_xpixel, ws_ypixel; };
+
 TYPEDEF unsigned socklen_t;
 TYPEDEF unsigned short sa_family_t;
 
diff --git a/include/arpa/inet.h b/include/arpa/inet.h
index 37f8c11e..9d20a15b 100644
--- a/include/arpa/inet.h
+++ b/include/arpa/inet.h
@@ -24,11 +24,6 @@ struct in_addr inet_makeaddr(in_addr_t, in_addr_t);
 in_addr_t inet_lnaof(struct in_addr);
 in_addr_t inet_netof(struct in_addr);
 
-#undef INET_ADDRSTRLEN
-#undef INET6_ADDRSTRLEN
-#define INET_ADDRSTRLEN  16
-#define INET6_ADDRSTRLEN 46
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/arpa/nameser.h b/include/arpa/nameser.h
index 581925a4..9c1327a1 100644
--- a/include/arpa/nameser.h
+++ b/include/arpa/nameser.h
@@ -188,6 +188,36 @@ typedef enum __ns_type {
 	ns_t_sink = 40,
 	ns_t_opt = 41,
 	ns_t_apl = 42,
+	ns_t_ds = 43,
+	ns_t_sshfp = 44,
+	ns_t_ipseckey = 45,
+	ns_t_rrsig = 46,
+	ns_t_nsec = 47,
+	ns_t_dnskey = 48,
+	ns_t_dhcid = 49,
+	ns_t_nsec3 = 50,
+	ns_t_nsec3param = 51,
+	ns_t_tlsa = 52,
+	ns_t_smimea = 53,
+	ns_t_hip = 55,
+	ns_t_ninfo = 56,
+	ns_t_rkey = 57,
+	ns_t_talink = 58,
+	ns_t_cds = 59,
+	ns_t_cdnskey = 60,
+	ns_t_openpgpkey = 61,
+	ns_t_csync = 62,
+	ns_t_spf = 99,
+	ns_t_uinfo = 100,
+	ns_t_uid = 101,
+	ns_t_gid = 102,
+	ns_t_unspec = 103,
+	ns_t_nid = 104,
+	ns_t_l32 = 105,
+	ns_t_l64 = 106,
+	ns_t_lp = 107,
+	ns_t_eui48 = 108,
+	ns_t_eui64 = 109,
 	ns_t_tkey = 249,
 	ns_t_tsig = 250,
 	ns_t_ixfr = 251,
@@ -196,6 +226,11 @@ typedef enum __ns_type {
 	ns_t_maila = 254,
 	ns_t_any = 255,
 	ns_t_zxfr = 256,
+	ns_t_uri = 256,
+	ns_t_caa = 257,
+	ns_t_avc = 258,
+	ns_t_ta = 32768,
+	ns_t_dlv = 32769,
 	ns_t_max = 65536
 } ns_type;
 
@@ -430,12 +465,48 @@ typedef struct {
 #define T_NAPTR		ns_t_naptr
 #define T_A6		ns_t_a6
 #define T_DNAME		ns_t_dname
+#define T_DS		ns_t_ds
+#define T_SSHFP		ns_t_sshfp
+#define T_IPSECKEY	ns_t_ipseckey
+#define T_RRSIG		ns_t_rrsig
+#define T_NSEC		ns_t_nsec
+#define T_DNSKEY	ns_t_dnskey
+#define T_DHCID		ns_t_dhcid
+#define T_NSEC3		ns_t_nsec3
+#define T_NSEC3PARAM	ns_t_nsec3param
+#define T_TLSA		ns_t_tlsa
+#define T_SMIMEA	ns_t_smimea
+#define T_HIP		ns_t_hip
+#define T_NINFO		ns_t_ninfo
+#define T_RKEY		ns_t_rkey
+#define T_TALINK	ns_t_talink
+#define T_CDS		ns_t_cds
+#define T_CDNSKEY	ns_t_cdnskey
+#define T_OPENPGPKEY	ns_t_openpgpkey
+#define T_CSYNC		ns_t_csync
+#define T_SPF		ns_t_spf
+#define T_UINFO		ns_t_uinfo
+#define T_UID		ns_t_uid
+#define T_GID		ns_t_gid
+#define T_UNSPEC	ns_t_unspec
+#define T_NID		ns_t_nid
+#define T_L32		ns_t_l32
+#define T_L64		ns_t_l64
+#define T_LP		ns_t_lp
+#define T_EUI48		ns_t_eui48
+#define T_EUI64		ns_t_eui64
+#define T_TKEY		ns_t_tkey
 #define	T_TSIG		ns_t_tsig
 #define	T_IXFR		ns_t_ixfr
 #define T_AXFR		ns_t_axfr
 #define T_MAILB		ns_t_mailb
 #define T_MAILA		ns_t_maila
 #define T_ANY		ns_t_any
+#define T_URI		ns_t_uri
+#define T_CAA		ns_t_caa
+#define T_AVC		ns_t_avc
+#define T_TA		ns_t_ta
+#define T_DLV		ns_t_dlv
 
 #define C_IN		ns_c_in
 #define C_CHAOS		ns_c_chaos
diff --git a/include/ctype.h b/include/ctype.h
index 7936536f..32bcef4d 100644
--- a/include/ctype.h
+++ b/include/ctype.h
@@ -64,7 +64,9 @@ int   isascii(int);
 int   toascii(int);
 #define _tolower(a) ((a)|0x20)
 #define _toupper(a) ((a)&0x5f)
+#ifndef __cplusplus
 #define isascii(a) (0 ? isascii(a) : (unsigned)(a) < 128)
+#endif
 
 #endif
 
diff --git a/include/dirent.h b/include/dirent.h
index 650ecf64..7fa60e06 100644
--- a/include/dirent.h
+++ b/include/dirent.h
@@ -9,14 +9,23 @@ extern "C" {
 
 #define __NEED_ino_t
 #define __NEED_off_t
-#if defined(_BSD_SOURCE) || defined(_GNU_SOURCE)
 #define __NEED_size_t
-#endif
+#define __NEED_ssize_t
 
 #include <bits/alltypes.h>
 
 #include <bits/dirent.h>
 
+typedef unsigned short reclen_t;
+
+struct posix_dent {
+	ino_t d_ino;
+	off_t d_off;
+	reclen_t d_reclen;
+	unsigned char d_type;
+	char d_name[];
+};
+
 typedef struct __dirstream DIR;
 
 #define d_fileno d_ino
@@ -29,6 +38,8 @@ int            readdir_r(DIR *__restrict, struct dirent *__restrict, struct dire
 void           rewinddir(DIR *);
 int            dirfd(DIR *);
 
+ssize_t posix_getdents(int, void *, size_t, int);
+
 int alphasort(const struct dirent **, const struct dirent **);
 int scandir(const char *, struct dirent ***, int (*)(const struct dirent *), int (*)(const struct dirent **, const struct dirent **));
 
@@ -37,7 +48,6 @@ void           seekdir(DIR *, long);
 long           telldir(DIR *);
 #endif
 
-#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 #define DT_UNKNOWN 0
 #define DT_FIFO 1
 #define DT_CHR 2
@@ -47,6 +57,8 @@ long           telldir(DIR *);
 #define DT_LNK 10
 #define DT_SOCK 12
 #define DT_WHT 14
+
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 #define IFTODT(x) ((x)>>12 & 017)
 #define DTTOIF(x) ((x)<<12)
 int getdents(int, struct dirent *, size_t);
@@ -56,7 +68,7 @@ int getdents(int, struct dirent *, size_t);
 int versionsort(const struct dirent **, const struct dirent **);
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define dirent64 dirent
 #define readdir64 readdir
 #define readdir64_r readdir_r
diff --git a/include/elf.h b/include/elf.h
index 549f92c1..8b622f63 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -315,7 +315,8 @@ typedef struct {
 #define EM_RISCV	243
 #define EM_BPF		247
 #define EM_CSKY		252
-#define EM_NUM		253
+#define EM_LOONGARCH	258
+#define EM_NUM		259
 
 #define EM_ALPHA	0x9026
 
@@ -385,7 +386,8 @@ typedef struct {
 #define SHT_PREINIT_ARRAY 16
 #define SHT_GROUP	  17
 #define SHT_SYMTAB_SHNDX  18
-#define	SHT_NUM		  19
+#define SHT_RELR	  19
+#define	SHT_NUM		  20
 #define SHT_LOOS	  0x60000000
 #define SHT_GNU_ATTRIBUTES 0x6ffffff5
 #define SHT_GNU_HASH	  0x6ffffff6
@@ -436,6 +438,7 @@ typedef struct {
 } Elf64_Chdr;
 
 #define ELFCOMPRESS_ZLIB	1
+#define ELFCOMPRESS_ZSTD	2
 #define ELFCOMPRESS_LOOS	0x60000000
 #define ELFCOMPRESS_HIOS	0x6fffffff
 #define ELFCOMPRESS_LOPROC	0x70000000
@@ -556,6 +559,11 @@ typedef struct {
 
 
 
+typedef Elf32_Word Elf32_Relr;
+typedef Elf64_Xword Elf64_Relr;
+
+
+
 #define ELF32_R_SYM(val)		((val) >> 8)
 #define ELF32_R_TYPE(val)		((val) & 0xff)
 #define ELF32_R_INFO(sym, type)		(((sym) << 8) + ((type) & 0xff))
@@ -603,6 +611,7 @@ typedef struct {
 #define PT_GNU_EH_FRAME	0x6474e550
 #define PT_GNU_STACK	0x6474e551
 #define PT_GNU_RELRO	0x6474e552
+#define PT_GNU_PROPERTY	0x6474e553
 #define PT_LOSUNW	0x6ffffffa
 #define PT_SUNWBSS	0x6ffffffa
 #define PT_SUNWSTACK	0x6ffffffb
@@ -685,6 +694,8 @@ typedef struct {
 #define NT_ARM_PAC_MASK	0x406
 #define NT_ARM_PACA_KEYS	0x407
 #define NT_ARM_PACG_KEYS	0x408
+#define NT_ARM_TAGGED_ADDR_CTRL	0x409
+#define NT_ARM_PAC_ENABLED_KEYS	0x40a
 #define NT_METAG_CBUF	0x500
 #define NT_METAG_RPIPE	0x501
 #define NT_METAG_TLS	0x502
@@ -693,7 +704,14 @@ typedef struct {
 #define NT_MIPS_DSP	0x800
 #define NT_MIPS_FP_MODE	0x801
 #define NT_MIPS_MSA	0x802
+#define NT_RISCV_CSR	0x900
+#define NT_RISCV_VECTOR	0x901
 #define NT_VERSION	1
+#define NT_LOONGARCH_CPUCFG	0xa00
+#define NT_LOONGARCH_CSR	0xa01
+#define NT_LOONGARCH_LSX	0xa02
+#define NT_LOONGARCH_LASX	0xa03
+#define NT_LOONGARCH_LBT	0xa04
 
 
 
@@ -751,7 +769,10 @@ typedef struct {
 #define DT_PREINIT_ARRAY 32
 #define DT_PREINIT_ARRAYSZ 33
 #define DT_SYMTAB_SHNDX	34
-#define	DT_NUM		35
+#define DT_RELRSZ	35
+#define DT_RELR		36
+#define DT_RELRENT	37
+#define	DT_NUM		38
 #define DT_LOOS		0x6000000d
 #define DT_HIOS		0x6ffff000
 #define DT_LOPROC	0x70000000
@@ -1085,6 +1106,7 @@ typedef struct {
 
 #define NT_GNU_BUILD_ID	3
 #define NT_GNU_GOLD_VERSION	4
+#define NT_GNU_PROPERTY_TYPE_0	5
 
 
 
@@ -3240,6 +3262,7 @@ enum
 #define R_RISCV_TLS_DTPREL64    9
 #define R_RISCV_TLS_TPREL32     10
 #define R_RISCV_TLS_TPREL64     11
+#define R_RISCV_TLSDESC         12
 
 #define R_RISCV_BRANCH          16
 #define R_RISCV_JAL             17
@@ -3266,16 +3289,11 @@ enum
 #define R_RISCV_SUB16           38
 #define R_RISCV_SUB32           39
 #define R_RISCV_SUB64           40
-#define R_RISCV_GNU_VTINHERIT   41
-#define R_RISCV_GNU_VTENTRY     42
+#define R_RISCV_GOT32_PCREL     41
 #define R_RISCV_ALIGN           43
 #define R_RISCV_RVC_BRANCH      44
 #define R_RISCV_RVC_JUMP        45
 #define R_RISCV_RVC_LUI         46
-#define R_RISCV_GPREL_I         47
-#define R_RISCV_GPREL_S         48
-#define R_RISCV_TPREL_I         49
-#define R_RISCV_TPREL_S         50
 #define R_RISCV_RELAX           51
 #define R_RISCV_SUB6            52
 #define R_RISCV_SET6            53
@@ -3283,6 +3301,111 @@ enum
 #define R_RISCV_SET16           55
 #define R_RISCV_SET32           56
 #define R_RISCV_32_PCREL        57
+#define R_RISCV_IRELATIVE       58
+#define R_RISCV_PLT32           59
+#define R_RISCV_SET_ULEB128     60
+#define R_RISCV_SUB_ULEB128     61
+#define R_RISCV_TLSDESC_HI20    62
+#define R_RISCV_TLSDESC_LOAD_LO12 63
+#define R_RISCV_TLSDESC_ADD_LO12  64
+#define R_RISCV_TLSDESC_CALL    65
+
+#define EF_LARCH_ABI_MODIFIER_MASK    0x07
+#define EF_LARCH_ABI_SOFT_FLOAT       0x01
+#define EF_LARCH_ABI_SINGLE_FLOAT     0x02
+#define EF_LARCH_ABI_DOUBLE_FLOAT     0x03
+#define EF_LARCH_OBJABI_V1            0x40
+
+#define R_LARCH_NONE                        0
+#define R_LARCH_32                          1
+#define R_LARCH_64                          2
+#define R_LARCH_RELATIVE                    3
+#define R_LARCH_COPY                        4
+#define R_LARCH_JUMP_SLOT                   5
+#define R_LARCH_TLS_DTPMOD32                6
+#define R_LARCH_TLS_DTPMOD64                7
+#define R_LARCH_TLS_DTPREL32                8
+#define R_LARCH_TLS_DTPREL64                9
+#define R_LARCH_TLS_TPREL32                 10
+#define R_LARCH_TLS_TPREL64                 11
+#define R_LARCH_IRELATIVE                   12
+#define R_LARCH_TLS_DESC64                  14
+#define R_LARCH_MARK_LA                     20
+#define R_LARCH_MARK_PCREL                  21
+#define R_LARCH_SOP_PUSH_PCREL              22
+#define R_LARCH_SOP_PUSH_ABSOLUTE           23
+#define R_LARCH_SOP_PUSH_DUP                24
+#define R_LARCH_SOP_PUSH_GPREL              25
+#define R_LARCH_SOP_PUSH_TLS_TPREL          26
+#define R_LARCH_SOP_PUSH_TLS_GOT            27
+#define R_LARCH_SOP_PUSH_TLS_GD             28
+#define R_LARCH_SOP_PUSH_PLT_PCREL          29
+#define R_LARCH_SOP_ASSERT                  30
+#define R_LARCH_SOP_NOT                     31
+#define R_LARCH_SOP_SUB                     32
+#define R_LARCH_SOP_SL                      33
+#define R_LARCH_SOP_SR                      34
+#define R_LARCH_SOP_ADD                     35
+#define R_LARCH_SOP_AND                     36
+#define R_LARCH_SOP_IF_ELSE                 37
+#define R_LARCH_SOP_POP_32_S_10_5           38
+#define R_LARCH_SOP_POP_32_U_10_12          39
+#define R_LARCH_SOP_POP_32_S_10_12          40
+#define R_LARCH_SOP_POP_32_S_10_16          41
+#define R_LARCH_SOP_POP_32_S_10_16_S2       42
+#define R_LARCH_SOP_POP_32_S_5_20           43
+#define R_LARCH_SOP_POP_32_S_0_5_10_16_S2   44
+#define R_LARCH_SOP_POP_32_S_0_10_10_16_S2  45
+#define R_LARCH_SOP_POP_32_U                46
+#define R_LARCH_ADD8                        47
+#define R_LARCH_ADD16                       48
+#define R_LARCH_ADD24                       49
+#define R_LARCH_ADD32                       50
+#define R_LARCH_ADD64                       51
+#define R_LARCH_SUB8                        52
+#define R_LARCH_SUB16                       53
+#define R_LARCH_SUB24                       54
+#define R_LARCH_SUB32                       55
+#define R_LARCH_SUB64                       56
+#define R_LARCH_GNU_VTINHERIT               57
+#define R_LARCH_GNU_VTENTRY                 58
+#define R_LARCH_B16                         64
+#define R_LARCH_B21                         65
+#define R_LARCH_B26                         66
+#define R_LARCH_ABS_HI20                    67
+#define R_LARCH_ABS_LO12                    68
+#define R_LARCH_ABS64_LO20                  69
+#define R_LARCH_ABS64_HI12                  70
+#define R_LARCH_PCALA_HI20                  71
+#define R_LARCH_PCALA_LO12                  72
+#define R_LARCH_PCALA64_LO20                73
+#define R_LARCH_PCALA64_HI12                74
+#define R_LARCH_GOT_PC_HI20                 75
+#define R_LARCH_GOT_PC_LO12                 76
+#define R_LARCH_GOT64_PC_LO20               77
+#define R_LARCH_GOT64_PC_HI12               78
+#define R_LARCH_GOT_HI20                    79
+#define R_LARCH_GOT_LO12                    80
+#define R_LARCH_GOT64_LO20                  81
+#define R_LARCH_GOT64_HI12                  82
+#define R_LARCH_TLS_LE_HI20                 83
+#define R_LARCH_TLS_LE_LO12                 84
+#define R_LARCH_TLS_LE64_LO20               85
+#define R_LARCH_TLS_LE64_HI12               86
+#define R_LARCH_TLS_IE_PC_HI20              87
+#define R_LARCH_TLS_IE_PC_LO12              88
+#define R_LARCH_TLS_IE64_PC_LO20            89
+#define R_LARCH_TLS_IE64_PC_HI12            90
+#define R_LARCH_TLS_IE_HI20                 91
+#define R_LARCH_TLS_IE_LO12                 92
+#define R_LARCH_TLS_IE64_LO20               93
+#define R_LARCH_TLS_IE64_HI12               94
+#define R_LARCH_TLS_LD_PC_HI20              95
+#define R_LARCH_TLS_LD_HI20                 96
+#define R_LARCH_TLS_GD_PC_HI20              97
+#define R_LARCH_TLS_GD_HI20                 98
+#define R_LARCH_32_PCREL                    99
+#define R_LARCH_RELAX                       100
 
 #ifdef __cplusplus
 }
diff --git a/include/fcntl.h b/include/fcntl.h
index b664cdc4..53f98a8b 100644
--- a/include/fcntl.h
+++ b/include/fcntl.h
@@ -184,7 +184,6 @@ struct f_owner_ex {
 #define SPLICE_F_MORE 4
 #define SPLICE_F_GIFT 8
 int fallocate(int, int, off_t, off_t);
-#define fallocate64 fallocate
 int name_to_handle_at(int, const char *, struct file_handle *, int *, int);
 int open_by_handle_at(int, struct file_handle *, int);
 ssize_t readahead(int, off_t, size_t);
@@ -195,7 +194,7 @@ ssize_t tee(int, int, size_t, unsigned);
 #define loff_t off_t
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define F_GETLK64 F_GETLK
 #define F_SETLK64 F_SETLK
 #define F_SETLKW64 F_SETLKW
@@ -207,6 +206,9 @@ ssize_t tee(int, int, size_t, unsigned);
 #define posix_fadvise64 posix_fadvise
 #define posix_fallocate64 posix_fallocate
 #define off64_t off_t
+#if defined(_GNU_SOURCE)
+#define fallocate64 fallocate
+#endif
 #endif
 
 #ifdef __cplusplus
diff --git a/include/ftw.h b/include/ftw.h
index b15c062a..d0445e8a 100644
--- a/include/ftw.h
+++ b/include/ftw.h
@@ -29,7 +29,7 @@ struct FTW {
 int ftw(const char *, int (*)(const char *, const struct stat *, int), int);
 int nftw(const char *, int (*)(const char *, const struct stat *, int, struct FTW *), int, int);
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define ftw64 ftw
 #define nftw64 nftw
 #endif
diff --git a/include/glob.h b/include/glob.h
index 4a562a20..fed06745 100644
--- a/include/glob.h
+++ b/include/glob.h
@@ -39,7 +39,7 @@ void globfree(glob_t *);
 #define GLOB_NOMATCH 3
 #define GLOB_NOSYS   4
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define glob64 glob
 #define globfree64 globfree
 #define glob64_t glob_t
diff --git a/include/locale.h b/include/locale.h
index ce384381..11106fea 100644
--- a/include/locale.h
+++ b/include/locale.h
@@ -7,7 +7,9 @@ extern "C" {
 
 #include <features.h>
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
diff --git a/include/netdb.h b/include/netdb.h
index d096c781..3af065e2 100644
--- a/include/netdb.h
+++ b/include/netdb.h
@@ -44,6 +44,7 @@ struct addrinfo {
 #define EAI_NONAME     -2
 #define EAI_AGAIN      -3
 #define EAI_FAIL       -4
+#define EAI_NODATA     -5
 #define EAI_FAMILY     -6
 #define EAI_SOCKTYPE   -7
 #define EAI_SERVICE    -8
diff --git a/include/netinet/if_ether.h b/include/netinet/if_ether.h
index a08485e7..3479f511 100644
--- a/include/netinet/if_ether.h
+++ b/include/netinet/if_ether.h
@@ -59,12 +59,14 @@
 #define ETH_P_PREAUTH	0x88C7
 #define ETH_P_TIPC	0x88CA
 #define ETH_P_LLDP	0x88CC
+#define ETH_P_MRP	0x88E3
 #define ETH_P_MACSEC	0x88E5
 #define ETH_P_8021AH	0x88E7
 #define ETH_P_MVRP	0x88F5
 #define ETH_P_1588	0x88F7
 #define ETH_P_NCSI	0x88F8
 #define ETH_P_PRP	0x88FB
+#define ETH_P_CFM	0x8902
 #define ETH_P_FCOE	0x8906
 #define ETH_P_TDLS	0x890D
 #define ETH_P_FIP	0x8914
diff --git a/include/netinet/in.h b/include/netinet/in.h
index 5b8b21e6..fb628b61 100644
--- a/include/netinet/in.h
+++ b/include/netinet/in.h
@@ -48,6 +48,7 @@ struct ipv6_mreq {
 #define INADDR_BROADCAST  ((in_addr_t) 0xffffffff)
 #define INADDR_NONE       ((in_addr_t) 0xffffffff)
 #define INADDR_LOOPBACK   ((in_addr_t) 0x7f000001)
+#define INADDR_DUMMY      ((in_addr_t) 0xc0000008)
 
 #define INADDR_UNSPEC_GROUP     ((in_addr_t) 0xe0000000)
 #define INADDR_ALLHOSTS_GROUP   ((in_addr_t) 0xe0000001)
@@ -60,8 +61,6 @@ struct ipv6_mreq {
 
 extern const struct in6_addr in6addr_any, in6addr_loopback;
 
-#undef INET_ADDRSTRLEN
-#undef INET6_ADDRSTRLEN
 #define INET_ADDRSTRLEN  16
 #define INET6_ADDRSTRLEN 46
 
@@ -103,8 +102,10 @@ uint16_t ntohs(uint16_t);
 #define IPPROTO_MH       135
 #define IPPROTO_UDPLITE  136
 #define IPPROTO_MPLS     137
+#define IPPROTO_ETHERNET 143
 #define IPPROTO_RAW      255
-#define IPPROTO_MAX      256
+#define IPPROTO_MPTCP    262
+#define IPPROTO_MAX      263
 
 #define IN6_IS_ADDR_UNSPECIFIED(a) \
         (((uint32_t *) (a))[0] == 0 && ((uint32_t *) (a))[1] == 0 && \
@@ -202,6 +203,7 @@ uint16_t ntohs(uint16_t);
 #define IP_CHECKSUM        23
 #define IP_BIND_ADDRESS_NO_PORT 24
 #define IP_RECVFRAGSIZE    25
+#define IP_RECVERR_RFC4884 26
 #define IP_MULTICAST_IF    32
 #define IP_MULTICAST_TTL   33
 #define IP_MULTICAST_LOOP  34
diff --git a/include/netinet/tcp.h b/include/netinet/tcp.h
index 44a007aa..fad1d844 100644
--- a/include/netinet/tcp.h
+++ b/include/netinet/tcp.h
@@ -78,6 +78,10 @@ enum {
 	TCP_NLA_DSACK_DUPS,
 	TCP_NLA_REORD_SEEN,
 	TCP_NLA_SRTT,
+	TCP_NLA_TIMEOUT_REHASH,
+	TCP_NLA_BYTES_NOTSENT,
+	TCP_NLA_EDT,
+	TCP_NLA_TTL,
 };
 
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
@@ -181,6 +185,13 @@ struct tcphdr {
 #define TCP_CA_Recovery		3
 #define TCP_CA_Loss		4
 
+enum tcp_fastopen_client_fail {
+	TFO_STATUS_UNSPEC,
+	TFO_COOKIE_UNAVAILABLE,
+	TFO_DATA_NOT_ACKED,
+	TFO_SYN_RETRANSMITTED,
+};
+
 struct tcp_info {
 	uint8_t tcpi_state;
 	uint8_t tcpi_ca_state;
@@ -189,7 +200,7 @@ struct tcp_info {
 	uint8_t tcpi_backoff;
 	uint8_t tcpi_options;
 	uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
-	uint8_t tcpi_delivery_rate_app_limited : 1;
+	uint8_t tcpi_delivery_rate_app_limited : 1, tcpi_fastopen_client_fail : 2;
 	uint32_t tcpi_rto;
 	uint32_t tcpi_ato;
 	uint32_t tcpi_snd_mss;
@@ -240,14 +251,15 @@ struct tcp_info {
 
 #define TCP_MD5SIG_MAXKEYLEN    80
 
-#define TCP_MD5SIG_FLAG_PREFIX  1
+#define TCP_MD5SIG_FLAG_PREFIX  0x1
+#define TCP_MD5SIG_FLAG_IFINDEX 0x2
 
 struct tcp_md5sig {
 	struct sockaddr_storage tcpm_addr;
 	uint8_t tcpm_flags;
 	uint8_t tcpm_prefixlen;
 	uint16_t tcpm_keylen;
-	uint32_t __tcpm_pad;
+	int tcpm_ifindex;
 	uint8_t tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
@@ -271,10 +283,21 @@ struct tcp_repair_window {
 	uint32_t rcv_wup;
 };
 
+#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
+
 struct tcp_zerocopy_receive {
 	uint64_t address;
 	uint32_t length;
 	uint32_t recv_skip_hint;
+	uint32_t inq;
+	int32_t err;
+	uint64_t copybuf_address;
+	int32_t copybuf_len;
+	uint32_t flags;
+	uint64_t msg_control;
+	uint64_t msg_controllen;
+	uint32_t msg_flags;
+	uint32_t reserved;
 };
 
 #endif
diff --git a/include/netinet/udp.h b/include/netinet/udp.h
index ffd89079..40c3f203 100644
--- a/include/netinet/udp.h
+++ b/include/netinet/udp.h
@@ -35,6 +35,7 @@ struct udphdr {
 #define UDP_ENCAP_GTP0		4
 #define UDP_ENCAP_GTP1U		5
 #define UDP_ENCAP_RXRPC		6
+#define TCP_ENCAP_ESPINTCP	7
 
 #define SOL_UDP            17
 
diff --git a/include/poll.h b/include/poll.h
index 472e4b84..272dc34a 100644
--- a/include/poll.h
+++ b/include/poll.h
@@ -36,7 +36,7 @@ struct pollfd {
 
 int poll (struct pollfd *, nfds_t, int);
 
-#ifdef _GNU_SOURCE
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 #define __NEED_time_t
 #define __NEED_struct_timespec
 #define __NEED_sigset_t
@@ -45,7 +45,7 @@ int ppoll(struct pollfd *, nfds_t, const struct timespec *, const sigset_t *);
 #endif
 
 #if _REDIR_TIME64
-#ifdef _GNU_SOURCE
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 __REDIR(ppoll, __ppoll_time64);
 #endif
 #endif
diff --git a/include/pthread.h b/include/pthread.h
index 984db680..89fd9ff7 100644
--- a/include/pthread.h
+++ b/include/pthread.h
@@ -74,6 +74,9 @@ extern "C" {
 #define PTHREAD_BARRIER_SERIAL_THREAD (-1)
 
 
+#define PTHREAD_NULL ((pthread_t)0)
+
+
 int pthread_create(pthread_t *__restrict, const pthread_attr_t *__restrict, void *(*)(void *), void *__restrict);
 int pthread_detach(pthread_t);
 _Noreturn void pthread_exit(void *);
@@ -218,6 +221,7 @@ int pthread_getaffinity_np(pthread_t, size_t, struct cpu_set_t *);
 int pthread_setaffinity_np(pthread_t, size_t, const struct cpu_set_t *);
 int pthread_getattr_np(pthread_t, pthread_attr_t *);
 int pthread_setname_np(pthread_t, const char *);
+int pthread_getname_np(pthread_t, char *, size_t);
 int pthread_getattr_default_np(pthread_attr_t *);
 int pthread_setattr_default_np(const pthread_attr_t *);
 int pthread_tryjoin_np(pthread_t, void **);
diff --git a/include/sched.h b/include/sched.h
index 822f464e..8c3b53f0 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -49,6 +49,7 @@ int     sched_yield(void);
 
 #ifdef _GNU_SOURCE
 #define CSIGNAL		0x000000ff
+#define CLONE_NEWTIME	0x00000080
 #define CLONE_VM	0x00000100
 #define CLONE_FS	0x00000200
 #define CLONE_FILES	0x00000400
@@ -77,11 +78,10 @@ int clone (int (*)(void *), void *, int, void *, ...);
 int unshare(int);
 int setns(int, int);
 
-void *memcpy(void *__restrict, const void *__restrict, size_t);
-int memcmp(const void *, const void *, size_t);
-void *memset (void *, int, size_t);
-void *calloc(size_t, size_t);
-void free(void *);
+int (memcmp)(const void *, const void *, size_t);
+void *(memset)(void *, int, size_t);
+void *(calloc)(size_t, size_t);
+void (free)(void *);
 
 typedef struct cpu_set_t { unsigned long __bits[128/sizeof(long)]; } cpu_set_t;
 int __sched_cpucount(size_t, const cpu_set_t *);
@@ -115,15 +115,15 @@ __CPU_op_func_S(XOR, ^)
 #define CPU_XOR_S(a,b,c,d) __CPU_XOR_S(a,b,c,d)
 
 #define CPU_COUNT_S(size,set) __sched_cpucount(size,set)
-#define CPU_ZERO_S(size,set) memset(set,0,size)
-#define CPU_EQUAL_S(size,set1,set2) (!memcmp(set1,set2,size))
+#define CPU_ZERO_S(size,set) (memset)(set,0,size)
+#define CPU_EQUAL_S(size,set1,set2) (!(memcmp)(set1,set2,size))
 
 #define CPU_ALLOC_SIZE(n) (sizeof(long) * ( (n)/(8*sizeof(long)) \
 	+ ((n)%(8*sizeof(long)) + 8*sizeof(long)-1)/(8*sizeof(long)) ) )
-#define CPU_ALLOC(n) ((cpu_set_t *)calloc(1,CPU_ALLOC_SIZE(n)))
-#define CPU_FREE(set) free(set)
+#define CPU_ALLOC(n) ((cpu_set_t *)(calloc)(1,CPU_ALLOC_SIZE(n)))
+#define CPU_FREE(set) (free)(set)
 
-#define CPU_SETSIZE 128
+#define CPU_SETSIZE 1024
 
 #define CPU_SET(i, set) CPU_SET_S(i,sizeof(cpu_set_t),set)
 #define CPU_CLR(i, set) CPU_CLR_S(i,sizeof(cpu_set_t),set)
diff --git a/include/setjmp.h b/include/setjmp.h
index 2d43abf8..1976af23 100644
--- a/include/setjmp.h
+++ b/include/setjmp.h
@@ -15,25 +15,33 @@ typedef struct __jmp_buf_tag {
 	unsigned long __ss[128/sizeof(long)];
 } jmp_buf[1];
 
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#define __setjmp_attr __attribute__((__returns_twice__))
+#else
+#define __setjmp_attr
+#endif
+
 #if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
  || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) \
  || defined(_BSD_SOURCE)
 typedef jmp_buf sigjmp_buf;
-int sigsetjmp (sigjmp_buf, int);
+int sigsetjmp (sigjmp_buf, int) __setjmp_attr;
 _Noreturn void siglongjmp (sigjmp_buf, int);
 #endif
 
 #if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) \
  || defined(_BSD_SOURCE)
-int _setjmp (jmp_buf);
+int _setjmp (jmp_buf) __setjmp_attr;
 _Noreturn void _longjmp (jmp_buf, int);
 #endif
 
-int setjmp (jmp_buf);
+int setjmp (jmp_buf) __setjmp_attr;
 _Noreturn void longjmp (jmp_buf, int);
 
 #define setjmp setjmp
 
+#undef __setjmp_attr
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/shadow.h b/include/shadow.h
index 2b1be413..a9d6940f 100644
--- a/include/shadow.h
+++ b/include/shadow.h
@@ -28,7 +28,6 @@ void setspent(void);
 void endspent(void);
 struct spwd *getspent(void);
 struct spwd *fgetspent(FILE *);
-struct spwd *sgetspent(const char *);
 int putspent(const struct spwd *, FILE *);
 
 struct spwd *getspnam(const char *);
diff --git a/include/signal.h b/include/signal.h
index fbdf667b..c347f861 100644
--- a/include/signal.h
+++ b/include/signal.h
@@ -75,6 +75,8 @@ typedef struct sigaltstack stack_t;
 #define SEGV_ACCERR 2
 #define SEGV_BNDERR 3
 #define SEGV_PKUERR 4
+#define SEGV_MTEAERR 8
+#define SEGV_MTESERR 9
 
 #define BUS_ADRALN 1
 #define BUS_ADRERR 2
@@ -176,18 +178,31 @@ struct sigaction {
 #define sa_handler   __sa_handler.sa_handler
 #define sa_sigaction __sa_handler.sa_sigaction
 
+#define SA_UNSUPPORTED 0x00000400
+#define SA_EXPOSE_TAGBITS 0x00000800
+
 struct sigevent {
 	union sigval sigev_value;
 	int sigev_signo;
 	int sigev_notify;
-	void (*sigev_notify_function)(union sigval);
-	pthread_attr_t *sigev_notify_attributes;
-	char __pad[56-3*sizeof(long)];
+	union {
+		char __pad[64 - 2*sizeof(int) - sizeof(union sigval)];
+		pid_t sigev_notify_thread_id;
+		struct {
+			void (*sigev_notify_function)(union sigval);
+			pthread_attr_t *sigev_notify_attributes;
+		} __sev_thread;
+	} __sev_fields;
 };
 
+#define sigev_notify_thread_id __sev_fields.sigev_notify_thread_id
+#define sigev_notify_function __sev_fields.__sev_thread.sigev_notify_function
+#define sigev_notify_attributes __sev_fields.__sev_thread.sigev_notify_attributes
+
 #define SIGEV_SIGNAL 0
 #define SIGEV_NONE 1
 #define SIGEV_THREAD 2
+#define SIGEV_THREAD_ID 4
 
 int __libc_current_sigrtmin(void);
 int __libc_current_sigrtmax(void);
@@ -249,6 +264,9 @@ void (*sigset(int, void (*)(int)))(int);
 #if defined(_BSD_SOURCE) || defined(_GNU_SOURCE)
 #define NSIG _NSIG
 typedef void (*sig_t)(int);
+
+#define SYS_SECCOMP 1
+#define SYS_USER_DISPATCH 2
 #endif
 
 #ifdef _GNU_SOURCE
diff --git a/include/stdc-predef.h b/include/stdc-predef.h
index f8cd4b89..642bad2d 100644
--- a/include/stdc-predef.h
+++ b/include/stdc-predef.h
@@ -7,4 +7,12 @@
 #define __STDC_IEC_559__ 1
 #endif
 
+#if !defined(__STDC_UTF_16__)
+#define __STDC_UTF_16__ 1
+#endif
+
+#if !defined(__STDC_UTF_32__)
+#define __STDC_UTF_32__ 1
+#endif
+
 #endif
diff --git a/include/stddef.h b/include/stddef.h
index bd753853..f25b8639 100644
--- a/include/stddef.h
+++ b/include/stddef.h
@@ -1,7 +1,9 @@
 #ifndef _STDDEF_H
 #define _STDDEF_H
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
diff --git a/include/stdio.h b/include/stdio.h
index 3604198c..4ea4c170 100644
--- a/include/stdio.h
+++ b/include/stdio.h
@@ -25,7 +25,9 @@ extern "C" {
 
 #include <bits/alltypes.h>
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
@@ -156,6 +158,13 @@ char *ctermid(char *);
 #define L_ctermid 20
 #endif
 
+#if defined(_GNU_SOURCE)
+#define RENAME_NOREPLACE (1 << 0)
+#define RENAME_EXCHANGE  (1 << 1)
+#define RENAME_WHITEOUT  (1 << 2)
+
+int renameat2(int, const char *, int, const char *, unsigned);
+#endif
 
 #if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) \
  || defined(_BSD_SOURCE)
@@ -203,7 +212,7 @@ typedef struct _IO_cookie_io_functions_t {
 FILE *fopencookie(void *, const char *, cookie_io_functions_t);
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define tmpfile64 tmpfile
 #define fopen64 fopen
 #define freopen64 freopen
diff --git a/include/stdlib.h b/include/stdlib.h
index 194c2033..475190bf 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -7,7 +7,9 @@ extern "C" {
 
 #include <features.h>
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
@@ -93,7 +95,7 @@ size_t __ctype_get_mb_cur_max(void);
 #define WTERMSIG(s) ((s) & 0x7f)
 #define WSTOPSIG(s) WEXITSTATUS(s)
 #define WIFEXITED(s) (!WTERMSIG(s))
-#define WIFSTOPPED(s) ((short)((((s)&0xffff)*0x10001)>>8) > 0x7f00)
+#define WIFSTOPPED(s) ((short)((((s)&0xffff)*0x10001U)>>8) > 0x7f00)
 #define WIFSIGNALED(s) (((s)&0xffff)-1U < 0xffu)
 
 int posix_memalign (void **, size_t, size_t);
@@ -145,6 +147,8 @@ int getloadavg(double *, int);
 int clearenv(void);
 #define WCOREDUMP(s) ((s) & 0x80)
 #define WIFCONTINUED(s) ((s) == 0xffff)
+void *reallocarray (void *, size_t, size_t);
+void qsort_r (void *, size_t, size_t, int (*)(const void *, const void *, void *), void *);
 #endif
 
 #ifdef _GNU_SOURCE
@@ -159,7 +163,7 @@ double strtod_l(const char *__restrict, char **__restrict, struct __locale_struc
 long double strtold_l(const char *__restrict, char **__restrict, struct __locale_struct *);
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define mkstemp64 mkstemp
 #define mkostemp64 mkostemp
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
diff --git a/include/string.h b/include/string.h
index 795a2abc..83e2b946 100644
--- a/include/string.h
+++ b/include/string.h
@@ -7,7 +7,9 @@ extern "C" {
 
 #include <features.h>
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
@@ -71,6 +73,7 @@ char *strsignal(int);
 char *strerror_l (int, locale_t);
 int strcoll_l (const char *, const char *, locale_t);
 size_t strxfrm_l (char *__restrict, const char *__restrict, size_t, locale_t);
+void *memmem(const void *, size_t, const void *, size_t);
 #endif
 
 #if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) \
@@ -90,12 +93,8 @@ void explicit_bzero (void *, size_t);
 int strverscmp (const char *, const char *);
 char *strchrnul(const char *, int);
 char *strcasestr(const char *, const char *);
-void *memmem(const void *, size_t, const void *, size_t);
 void *memrchr(const void *, int, size_t);
 void *mempcpy(void *, const void *, size_t);
-#ifndef __cplusplus
-char *basename();
-#endif
 #endif
 
 #ifdef __cplusplus
diff --git a/include/strings.h b/include/strings.h
index db0960b4..b7a5ea08 100644
--- a/include/strings.h
+++ b/include/strings.h
@@ -5,6 +5,7 @@
 extern "C" {
 #endif
 
+#include <features.h>
 
 #define __NEED_size_t
 #define __NEED_locale_t
diff --git a/include/sys/epoll.h b/include/sys/epoll.h
index ac81a841..5f975c4a 100644
--- a/include/sys/epoll.h
+++ b/include/sys/epoll.h
@@ -7,6 +7,7 @@ extern "C" {
 
 #include <stdint.h>
 #include <sys/types.h>
+#include <sys/ioctl.h>
 #include <fcntl.h>
 
 #define __NEED_sigset_t
@@ -54,6 +55,17 @@ __attribute__ ((__packed__))
 #endif
 ;
 
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
 
 int epoll_create(int);
 int epoll_create1(int);
diff --git a/include/sys/fanotify.h b/include/sys/fanotify.h
index b637c8f5..10e5f15e 100644
--- a/include/sys/fanotify.h
+++ b/include/sys/fanotify.h
@@ -55,8 +55,9 @@ struct fanotify_response {
 #define FAN_OPEN_PERM 0x10000
 #define FAN_ACCESS_PERM 0x20000
 #define FAN_OPEN_EXEC_PERM 0x40000
-#define FAN_ONDIR 0x40000000
+#define FAN_DIR_MODIFY 0x00080000
 #define FAN_EVENT_ON_CHILD 0x08000000
+#define FAN_ONDIR 0x40000000
 #define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE)
 #define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO)
 #define FAN_CLOEXEC 0x01
@@ -70,6 +71,9 @@ struct fanotify_response {
 #define FAN_ENABLE_AUDIT 0x40
 #define FAN_REPORT_TID 0x100
 #define FAN_REPORT_FID 0x200
+#define FAN_REPORT_DIR_FID 0x00000400
+#define FAN_REPORT_NAME 0x00000800
+#define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
 #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
 #define FAN_MARK_ADD 0x01
 #define FAN_MARK_REMOVE 0x02
@@ -88,6 +92,8 @@ struct fanotify_response {
 #define FAN_ALL_OUTGOING_EVENTS (FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_Q_OVERFLOW)
 #define FANOTIFY_METADATA_VERSION 3
 #define FAN_EVENT_INFO_TYPE_FID 1
+#define FAN_EVENT_INFO_TYPE_DFID_NAME 2
+#define FAN_EVENT_INFO_TYPE_DFID 3
 #define FAN_ALLOW 0x01
 #define FAN_DENY 0x02
 #define FAN_AUDIT 0x10
diff --git a/include/sys/ioctl.h b/include/sys/ioctl.h
index c2ce3b48..a9a2346e 100644
--- a/include/sys/ioctl.h
+++ b/include/sys/ioctl.h
@@ -4,6 +4,8 @@
 extern "C" {
 #endif
 
+#define __NEED_struct_winsize
+
 #include <bits/alltypes.h>
 #include <bits/ioctl.h>
 
@@ -47,13 +49,6 @@ extern "C" {
 
 #define TIOCSER_TEMT 1
 
-struct winsize {
-	unsigned short ws_row;
-	unsigned short ws_col;
-	unsigned short ws_xpixel;
-	unsigned short ws_ypixel;
-};
-
 #define SIOCADDRT          0x890B
 #define SIOCDELRT          0x890C
 #define SIOCRTMSG          0x890D
diff --git a/include/sys/membarrier.h b/include/sys/membarrier.h
index 10cb3108..11193eda 100644
--- a/include/sys/membarrier.h
+++ b/include/sys/membarrier.h
@@ -9,9 +9,13 @@
 #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 16
 #define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE 32
 #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE 64
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ 128
+#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ 256
 
 #define MEMBARRIER_CMD_SHARED MEMBARRIER_CMD_GLOBAL
 
+#define MEMBARRIER_CMD_FLAG_CPU 1
+
 int membarrier(int, int);
 
 #endif
diff --git a/include/sys/mman.h b/include/sys/mman.h
index 3bade727..3d5d0f9c 100644
--- a/include/sys/mman.h
+++ b/include/sys/mman.h
@@ -40,6 +40,7 @@ extern "C" {
 
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
+#define MAP_HUGE_16KB  (14 << 26)
 #define MAP_HUGE_64KB  (16 << 26)
 #define MAP_HUGE_512KB (19 << 26)
 #define MAP_HUGE_1MB   (20 << 26)
@@ -101,6 +102,7 @@ extern "C" {
 #ifdef _GNU_SOURCE
 #define MREMAP_MAYMOVE 1
 #define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4
 
 #define MLOCK_ONFAULT 0x01
 
@@ -139,7 +141,7 @@ int mincore (void *, size_t, unsigned char *);
 int shm_open (const char *, int, mode_t);
 int shm_unlink (const char *);
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define mmap64 mmap
 #define off64_t off_t
 #endif
diff --git a/include/sys/mount.h b/include/sys/mount.h
index 57a89c09..09bd6e9d 100644
--- a/include/sys/mount.h
+++ b/include/sys/mount.h
@@ -31,6 +31,7 @@ extern "C" {
 #define MS_REMOUNT     32
 #define MS_MANDLOCK    64
 #define MS_DIRSYNC     128
+#define MS_NOSYMFOLLOW 256
 #define MS_NOATIME     1024
 #define MS_NODIRATIME  2048
 #define MS_BIND        4096
diff --git a/include/sys/personality.h b/include/sys/personality.h
index 31d43dfe..411dc475 100644
--- a/include/sys/personality.h
+++ b/include/sys/personality.h
@@ -5,7 +5,9 @@
 extern "C" {
 #endif
 
+#define UNAME26            0x0020000
 #define ADDR_NO_RANDOMIZE  0x0040000
+#define FDPIC_FUNCPTRS     0x0080000
 #define MMAP_PAGE_ZERO     0x0100000
 #define ADDR_COMPAT_LAYOUT 0x0200000
 #define READ_IMPLIES_EXEC  0x0400000
@@ -17,6 +19,7 @@ extern "C" {
 
 #define PER_LINUX 0
 #define PER_LINUX_32BIT ADDR_LIMIT_32BIT
+#define PER_LINUX_FDPIC FDPIC_FUNCPTRS
 #define PER_SVR4 (1 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO)
 #define PER_SVR3 (2 | STICKY_TIMEOUTS | SHORT_INODE)
 #define PER_SCOSVR3 (3 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE)
diff --git a/include/sys/prctl.h b/include/sys/prctl.h
index d9c846e9..087a75c9 100644
--- a/include/sys/prctl.h
+++ b/include/sys/prctl.h
@@ -157,6 +157,25 @@ struct prctl_mm_map {
 #define PR_SET_TAGGED_ADDR_CTRL 55
 #define PR_GET_TAGGED_ADDR_CTRL 56
 #define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+#define PR_MTE_TCF_SHIFT 1
+#define PR_MTE_TCF_NONE  (0UL << 1)
+#define PR_MTE_TCF_SYNC  (1UL << 1)
+#define PR_MTE_TCF_ASYNC (2UL << 1)
+#define PR_MTE_TCF_MASK  (3UL << 1)
+#define PR_MTE_TAG_SHIFT 3
+#define PR_MTE_TAG_MASK  (0xffffUL << 3)
+
+#define PR_SET_IO_FLUSHER 57
+#define PR_GET_IO_FLUSHER 58
+
+#define PR_SET_SYSCALL_USER_DISPATCH 59
+#define PR_SYS_DISPATCH_OFF 0
+#define PR_SYS_DISPATCH_ON 1
+#define SYSCALL_DISPATCH_FILTER_ALLOW 0
+#define SYSCALL_DISPATCH_FILTER_BLOCK 1
+
+#define PR_PAC_SET_ENABLED_KEYS 60
+#define PR_PAC_GET_ENABLED_KEYS 61
 
 int prctl (int, ...);
 
diff --git a/include/sys/ptrace.h b/include/sys/ptrace.h
index 5d62a985..c72e3c06 100644
--- a/include/sys/ptrace.h
+++ b/include/sys/ptrace.h
@@ -42,6 +42,7 @@ extern "C" {
 #define PTRACE_SECCOMP_GET_FILTER 0x420c
 #define PTRACE_SECCOMP_GET_METADATA 0x420d
 #define PTRACE_GET_SYSCALL_INFO 0x420e
+#define PTRACE_GET_RSEQ_CONFIGURATION	0x420f
 
 #define PT_READ_I PTRACE_PEEKTEXT
 #define PT_READ_D PTRACE_PEEKDATA
@@ -130,6 +131,14 @@ struct __ptrace_syscall_info {
 	};
 };
 
+struct __ptrace_rseq_configuration {
+	uint64_t rseq_abi_pointer;
+	uint32_t rseq_abi_size;
+	uint32_t signature;
+	uint32_t flags;
+	uint32_t pad;
+};
+
 long ptrace(int, ...);
 
 #ifdef __cplusplus
diff --git a/include/sys/random.h b/include/sys/random.h
index 4ee7bf2c..59e40ab8 100644
--- a/include/sys/random.h
+++ b/include/sys/random.h
@@ -10,6 +10,7 @@ extern "C" {
 
 #define GRND_NONBLOCK	0x0001
 #define GRND_RANDOM	0x0002
+#define GRND_INSECURE	0x0004
 
 ssize_t getrandom(void *, size_t, unsigned);
 
diff --git a/include/sys/reg.h b/include/sys/reg.h
index b47452d0..0272e137 100644
--- a/include/sys/reg.h
+++ b/include/sys/reg.h
@@ -4,6 +4,15 @@
 #include <limits.h>
 #include <unistd.h>
 
+#include <bits/alltypes.h>
+
+#undef __WORDSIZE
+#if __LONG_MAX == 0x7fffffffL
+#define __WORDSIZE 32
+#else
+#define __WORDSIZE 64
+#endif
+
 #include <bits/reg.h>
 
 #endif
diff --git a/include/sys/resource.h b/include/sys/resource.h
index 3068328d..e8bfbe1f 100644
--- a/include/sys/resource.h
+++ b/include/sys/resource.h
@@ -95,7 +95,7 @@ int prlimit(pid_t, int, const struct rlimit *, struct rlimit *);
 
 #define RLIM_NLIMITS RLIMIT_NLIMITS
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define RLIM64_INFINITY RLIM_INFINITY
 #define RLIM64_SAVED_CUR RLIM_SAVED_CUR
 #define RLIM64_SAVED_MAX RLIM_SAVED_MAX
diff --git a/include/sys/sendfile.h b/include/sys/sendfile.h
index e7570d8e..253a041b 100644
--- a/include/sys/sendfile.h
+++ b/include/sys/sendfile.h
@@ -10,7 +10,7 @@ extern "C" {
 
 ssize_t sendfile(int, int, off_t *, size_t);
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define sendfile64 sendfile
 #define off64_t off_t
 #endif
diff --git a/include/sys/socket.h b/include/sys/socket.h
index 38f5bb17..6dc1e40a 100644
--- a/include/sys/socket.h
+++ b/include/sys/socket.h
@@ -289,6 +289,8 @@ struct linger {
 #define SCM_TXTIME              SO_TXTIME
 #define SO_BINDTOIFINDEX        62
 #define SO_DETACH_REUSEPORT_BPF 68
+#define SO_PREFER_BUSY_POLL     69
+#define SO_BUSY_POLL_BUDGET     70
 
 #ifndef SOL_SOCKET
 #define SOL_SOCKET      1
diff --git a/include/sys/stat.h b/include/sys/stat.h
index 10d446c4..c924ce2f 100644
--- a/include/sys/stat.h
+++ b/include/sys/stat.h
@@ -18,6 +18,13 @@ extern "C" {
 #define __NEED_blkcnt_t
 #define __NEED_struct_timespec
 
+#ifdef _GNU_SOURCE
+#define __NEED_int64_t
+#define __NEED_uint64_t
+#define __NEED_uint32_t
+#define __NEED_uint16_t
+#endif
+
 #include <bits/alltypes.h>
 
 #include <bits/stat.h>
@@ -98,7 +105,72 @@ int lchmod(const char *, mode_t);
 #define S_IEXEC S_IXUSR
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_GNU_SOURCE)
+#define STATX_TYPE 1U
+#define STATX_MODE 2U
+#define STATX_NLINK 4U
+#define STATX_UID 8U
+#define STATX_GID 0x10U
+#define STATX_ATIME 0x20U
+#define STATX_MTIME 0x40U
+#define STATX_CTIME 0x80U
+#define STATX_INO 0x100U
+#define STATX_SIZE 0x200U
+#define STATX_BLOCKS 0x400U
+#define STATX_BASIC_STATS 0x7ffU
+#define STATX_BTIME 0x800U
+#define STATX_ALL 0xfffU
+#define STATX_MNT_ID 0x1000U
+#define STATX_DIOALIGN 0x2000U
+#define STATX_MNT_ID_UNIQUE 0x4000U
+
+#define STATX_ATTR_COMPRESSED 0x4
+#define STATX_ATTR_IMMUTABLE 0x10
+#define STATX_ATTR_APPEND 0x20
+#define STATX_ATTR_NODUMP 0x40
+#define STATX_ATTR_ENCRYPTED 0x800
+#define STATX_ATTR_AUTOMOUNT 0x1000
+#define STATX_ATTR_MOUNT_ROOT 0x2000
+#define STATX_ATTR_VERITY 0x100000
+#define STATX_ATTR_DAX 0x200000
+
+struct statx_timestamp {
+	int64_t tv_sec;
+	uint32_t tv_nsec, __pad;
+};
+
+struct statx {
+	uint32_t stx_mask;
+	uint32_t stx_blksize;
+	uint64_t stx_attributes;
+	uint32_t stx_nlink;
+	uint32_t stx_uid;
+	uint32_t stx_gid;
+	uint16_t stx_mode;
+	uint16_t __pad0[1];
+	uint64_t stx_ino;
+	uint64_t stx_size;
+	uint64_t stx_blocks;
+	uint64_t stx_attributes_mask;
+	struct statx_timestamp stx_atime;
+	struct statx_timestamp stx_btime;
+	struct statx_timestamp stx_ctime;
+	struct statx_timestamp stx_mtime;
+	uint32_t stx_rdev_major;
+	uint32_t stx_rdev_minor;
+	uint32_t stx_dev_major;
+	uint32_t stx_dev_minor;
+	uint64_t stx_mnt_id;
+	uint32_t stx_dio_mem_align;
+	uint32_t stx_dio_offset_align;
+	uint64_t stx_subvol;
+	uint64_t __pad1[11];
+};
+
+int statx(int, const char *__restrict, int, unsigned, struct statx *__restrict);
+#endif
+
+#if defined(_LARGEFILE64_SOURCE)
 #define stat64 stat
 #define fstat64 fstat
 #define lstat64 lstat
diff --git a/include/sys/statfs.h b/include/sys/statfs.h
index 6f4c6230..7a2e11cd 100644
--- a/include/sys/statfs.h
+++ b/include/sys/statfs.h
@@ -18,7 +18,7 @@ typedef struct __fsid_t {
 int statfs (const char *, struct statfs *);
 int fstatfs (int, struct statfs *);
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define statfs64 statfs
 #define fstatfs64 fstatfs
 #define fsblkcnt64_t fsblkcnt_t
diff --git a/include/sys/statvfs.h b/include/sys/statvfs.h
index 793490b6..71d9d1f9 100644
--- a/include/sys/statvfs.h
+++ b/include/sys/statvfs.h
@@ -23,7 +23,8 @@ struct statvfs {
 	unsigned long f_fsid;
 #endif
 	unsigned long f_flag, f_namemax;
-	int __reserved[6];
+	unsigned int f_type;
+	int __reserved[5];
 };
 
 int statvfs (const char *__restrict, struct statvfs *__restrict);
@@ -42,7 +43,7 @@ int fstatvfs (int, struct statvfs *);
 #define ST_NODIRATIME  2048
 #define ST_RELATIME    4096
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define statvfs64 statvfs
 #define fstatvfs64 fstatvfs
 #define fsblkcnt64_t fsblkcnt_t
diff --git a/include/sys/types.h b/include/sys/types.h
index 0c35541d..3363374f 100644
--- a/include/sys/types.h
+++ b/include/sys/types.h
@@ -71,7 +71,7 @@ typedef unsigned long long u_quad_t;
 #include <sys/select.h>
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define blkcnt64_t blkcnt_t
 #define fsblkcnt64_t fsblkcnt_t
 #define fsfilcnt64_t fsfilcnt_t
diff --git a/include/sys/uio.h b/include/sys/uio.h
index 00f73a2f..5e99c7fa 100644
--- a/include/sys/uio.h
+++ b/include/sys/uio.h
@@ -29,7 +29,7 @@ ssize_t writev (int, const struct iovec *, int);
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 ssize_t preadv (int, const struct iovec *, int, off_t);
 ssize_t pwritev (int, const struct iovec *, int, off_t);
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define preadv64 preadv
 #define pwritev64 pwritev
 #define off64_t off_t
@@ -39,6 +39,14 @@ ssize_t pwritev (int, const struct iovec *, int, off_t);
 #ifdef _GNU_SOURCE
 ssize_t process_vm_writev(pid_t, const struct iovec *, unsigned long, const struct iovec *, unsigned long, unsigned long);
 ssize_t process_vm_readv(pid_t, const struct iovec *, unsigned long, const struct iovec *, unsigned long, unsigned long);
+ssize_t preadv2 (int, const struct iovec *, int, off_t, int);
+ssize_t pwritev2 (int, const struct iovec *, int, off_t, int);
+#define RWF_HIPRI 0x00000001
+#define RWF_DSYNC 0x00000002
+#define RWF_SYNC 0x00000004
+#define RWF_NOWAIT 0x00000008
+#define RWF_APPEND 0x00000010
+#define RWF_NOAPPEND 0x00000020
 #endif
 
 #ifdef __cplusplus
diff --git a/include/sys/user.h b/include/sys/user.h
index 96a03400..511caba3 100644
--- a/include/sys/user.h
+++ b/include/sys/user.h
@@ -8,6 +8,15 @@ extern "C" {
 #include <stdint.h>
 #include <unistd.h>
 
+#include <bits/alltypes.h>
+
+#undef __WORDSIZE
+#if __LONG_MAX == 0x7fffffffL
+#define __WORDSIZE 32
+#else
+#define __WORDSIZE 64
+#endif
+
 #include <bits/user.h>
 
 #ifdef __cplusplus
diff --git a/include/sys/wait.h b/include/sys/wait.h
index d4b1f2e1..8ced671b 100644
--- a/include/sys/wait.h
+++ b/include/sys/wait.h
@@ -50,7 +50,7 @@ pid_t wait4 (pid_t, int *, int, struct rusage *);
 #define WSTOPSIG(s) WEXITSTATUS(s)
 #define WCOREDUMP(s) ((s) & 0x80)
 #define WIFEXITED(s) (!WTERMSIG(s))
-#define WIFSTOPPED(s) ((short)((((s)&0xffff)*0x10001)>>8) > 0x7f00)
+#define WIFSTOPPED(s) ((short)((((s)&0xffff)*0x10001U)>>8) > 0x7f00)
 #define WIFSIGNALED(s) (((s)&0xffff)-1U < 0xffu)
 #define WIFCONTINUED(s) ((s) == 0xffff)
 
diff --git a/include/syslog.h b/include/syslog.h
index 5b4d2964..57599e07 100644
--- a/include/syslog.h
+++ b/include/syslog.h
@@ -18,7 +18,7 @@ extern "C" {
 
 #define LOG_PRIMASK 7
 #define LOG_PRI(p) ((p)&LOG_PRIMASK)
-#define	LOG_MAKEPRI(f, p) (((f)<<3)|(p))
+#define	LOG_MAKEPRI(f, p) ((f)|(p))
 
 #define LOG_MASK(p) (1<<(p))
 #define LOG_UPTO(p) ((1<<((p)+1))-1)
diff --git a/include/termios.h b/include/termios.h
index d73c780d..cbb53301 100644
--- a/include/termios.h
+++ b/include/termios.h
@@ -8,6 +8,7 @@ extern "C" {
 #include <features.h>
 
 #define __NEED_pid_t
+#define __NEED_struct_winsize
 
 #include <bits/alltypes.h>
 
@@ -27,6 +28,9 @@ int cfsetispeed (struct termios *, speed_t);
 int tcgetattr (int, struct termios *);
 int tcsetattr (int, int, const struct termios *);
 
+int tcgetwinsize (int, struct winsize *);
+int tcsetwinsize (int, const struct winsize *);
+
 int tcsendbreak (int, int);
 int tcdrain (int);
 int tcflush (int, int);
diff --git a/include/time.h b/include/time.h
index 5494df18..3d948372 100644
--- a/include/time.h
+++ b/include/time.h
@@ -7,7 +7,9 @@ extern "C" {
 
 #include <features.h>
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
diff --git a/include/unistd.h b/include/unistd.h
index 7bcbff94..42b0e82b 100644
--- a/include/unistd.h
+++ b/include/unistd.h
@@ -14,8 +14,12 @@ extern "C" {
 #define SEEK_SET 0
 #define SEEK_CUR 1
 #define SEEK_END 2
+#define SEEK_DATA 3
+#define SEEK_HOLE 4
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
@@ -82,6 +86,7 @@ unsigned sleep(unsigned);
 int pause(void);
 
 pid_t fork(void);
+pid_t _Fork(void);
 int execve(const char *, char *const [], char *const []);
 int execv(const char *, char *const []);
 int execle(const char *, const char *, ...);
@@ -190,9 +195,10 @@ int syncfs(int);
 int euidaccess(const char *, int);
 int eaccess(const char *, int);
 ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned);
+pid_t gettid(void);
 #endif
 
-#if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#if defined(_LARGEFILE64_SOURCE)
 #define lseek64 lseek
 #define pread64 pread
 #define pwrite64 pwrite
@@ -251,7 +257,13 @@ ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned);
 
 #define _POSIX2_C_BIND          _POSIX_VERSION
 
-#include <bits/posix.h>
+#if __LONG_MAX == 0x7fffffffL
+#define _POSIX_V6_ILP32_OFFBIG  1
+#define _POSIX_V7_ILP32_OFFBIG  1
+#else
+#define _POSIX_V6_LP64_OFF64  1
+#define _POSIX_V7_LP64_OFF64  1
+#endif
 
 
 
@@ -419,6 +431,8 @@ ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned);
 #define _SC_XOPEN_STREAMS	246
 #define _SC_THREAD_ROBUST_PRIO_INHERIT	247
 #define _SC_THREAD_ROBUST_PRIO_PROTECT	248
+#define _SC_MINSIGSTKSZ	249
+#define _SC_SIGSTKSZ	250
 
 #define _CS_PATH	0
 #define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS	1
@@ -461,6 +475,8 @@ ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned);
 #define _CS_POSIX_V7_LPBIG_OFFBIG_LINTFLAGS	1147
 #define _CS_V6_ENV	1148
 #define _CS_V7_ENV	1149
+#define _CS_POSIX_V7_THREADS_CFLAGS	1150
+#define _CS_POSIX_V7_THREADS_LDFLAGS	1151
 
 #ifdef __cplusplus
 }
diff --git a/include/wchar.h b/include/wchar.h
index 88eb55b1..ed5d774d 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -38,7 +38,9 @@ extern "C" {
 #define WCHAR_MIN (-1-0x7fffffff+L'\0')
 #endif
 
-#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define NULL nullptr
+#elif defined(__cplusplus)
 #define NULL 0L
 #else
 #define NULL ((void*)0)
diff --git a/ldso/dlstart.c b/ldso/dlstart.c
index 20d50f2c..259f5e18 100644
--- a/ldso/dlstart.c
+++ b/ldso/dlstart.c
@@ -140,6 +140,21 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
 		size_t *rel_addr = (void *)(base + rel[0]);
 		*rel_addr = base + rel[2];
 	}
+
+	rel = (void *)(base+dyn[DT_RELR]);
+	rel_size = dyn[DT_RELRSZ];
+	size_t *relr_addr = 0;
+	for (; rel_size; rel++, rel_size-=sizeof(size_t)) {
+		if ((rel[0]&1) == 0) {
+			relr_addr = (void *)(base + rel[0]);
+			*relr_addr++ += base;
+		} else {
+			for (size_t i=0, bitmap=rel[0]; bitmap>>=1; i++)
+				if (bitmap&1)
+					relr_addr[i] += base;
+			relr_addr += 8*sizeof(size_t)-1;
+		}
+	}
 #endif
 
 	stage2_func dls2;
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index afec985a..715948f4 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -1,6 +1,5 @@
 #define _GNU_SOURCE
 #define SYSCALL_NO_TLS 1
-#include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stddef.h>
@@ -21,11 +20,26 @@
 #include <semaphore.h>
 #include <sys/membarrier.h>
 #include "pthread_impl.h"
+#include "fork_impl.h"
 #include "libc.h"
 #include "dynlink.h"
-#include "malloc_impl.h"
 
-static void error(const char *, ...);
+static size_t ldso_page_size;
+/* libc.h may have defined a macro for dynamic PAGE_SIZE already, but
+ * PAGESIZE is only defined if it's constant for the arch. */
+#ifndef PAGESIZE
+#undef PAGE_SIZE
+#define PAGE_SIZE ldso_page_size
+#endif
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
+
+static void error_impl(const char *, ...);
+static void error_noop(const char *, ...);
+static void (*error)(const char *, ...) = error_noop;
 
 #define MAXP2(a,b) (-(-(a)&-(b)))
 #define ALIGN(x,y) ((x)+(y)-1 & -(y))
@@ -79,7 +93,7 @@ struct dso {
 	struct dso **deps, *needed_by;
 	size_t ndeps_direct;
 	size_t next_dep;
-	int ctor_visitor;
+	pthread_t ctor_visitor;
 	char *rpath_orig, *rpath;
 	struct tls_module tls;
 	size_t tls_id;
@@ -145,6 +159,8 @@ static struct fdpic_dummy_loadmap app_dummy_loadmap;
 
 struct debug *_dl_debug_addr = &debug;
 
+extern weak hidden char __ehdr_start[];
+
 extern hidden int __malloc_replaced;
 
 hidden void (*const __init_array_start)(void)=0, (*const __fini_array_start)(void)=0;
@@ -204,7 +220,8 @@ static void decode_vec(size_t *v, size_t *a, size_t cnt)
 	size_t i;
 	for (i=0; i<cnt; i++) a[i] = 0;
 	for (; v[0]; v+=2) if (v[0]-1<cnt-1) {
-		a[0] |= 1UL<<v[0];
+		if (v[0] < 8*sizeof(long))
+			a[0] |= 1UL<<v[0];
 		a[v[0]] = v[1];
 	}
 }
@@ -330,6 +347,35 @@ static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
 	return find_sym2(dso, s, need_def, 0);
 }
 
+static struct symdef get_lfs64(const char *name)
+{
+	const char *p;
+	static const char lfs64_list[] =
+		"aio_cancel\0aio_error\0aio_fsync\0aio_read\0aio_return\0"
+		"aio_suspend\0aio_write\0alphasort\0creat\0fallocate\0"
+		"fgetpos\0fopen\0freopen\0fseeko\0fsetpos\0fstat\0"
+		"fstatat\0fstatfs\0fstatvfs\0ftello\0ftruncate\0ftw\0"
+		"getdents\0getrlimit\0glob\0globfree\0lio_listio\0"
+		"lockf\0lseek\0lstat\0mkostemp\0mkostemps\0mkstemp\0"
+		"mkstemps\0mmap\0nftw\0open\0openat\0posix_fadvise\0"
+		"posix_fallocate\0pread\0preadv\0prlimit\0pwrite\0"
+		"pwritev\0readdir\0scandir\0sendfile\0setrlimit\0"
+		"stat\0statfs\0statvfs\0tmpfile\0truncate\0versionsort\0"
+		"__fxstat\0__fxstatat\0__lxstat\0__xstat\0";
+	if (!strcmp(name, "readdir64_r"))
+		return find_sym(&ldso, "readdir_r", 1);
+	size_t l = strnlen(name, 18);
+	if (l<2 || name[l-2]!='6' || name[l-1]!='4' || name[l])
+		goto nomatch;
+	for (p=lfs64_list; *p; p++) {
+		if (!strncmp(name, p, l-2) && !p[l-2])
+			return find_sym(&ldso, p, 1);
+		while (*p) p++;
+	}
+nomatch:
+	return (struct symdef){ 0 };
+}
+
 static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stride)
 {
 	unsigned char *base = dso->base;
@@ -383,6 +429,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 			def = (sym->st_info>>4) == STB_LOCAL
 				? (struct symdef){ .dso = dso, .sym = sym }
 				: find_sym(ctx, name, type==REL_PLT);
+			if (!def.sym) def = get_lfs64(name);
 			if (!def.sym && (sym->st_shndx != SHN_UNDEF
 			    || sym->st_info>>4 != STB_WEAK)) {
 				if (dso->lazy && (type==REL_PLT || type==REL_GOT)) {
@@ -415,8 +462,6 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 		}
 
 		switch(type) {
-		case REL_NONE:
-			break;
 		case REL_OFFSET:
 			addend -= (size_t)reloc_addr;
 		case REL_SYMBOLIC:
@@ -469,7 +514,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 			break;
 #endif
 		case REL_TLSDESC:
-			if (stride<3) addend = reloc_addr[1];
+			if (stride<3) addend = reloc_addr[!TLSDESC_BACKWARDS];
 			if (def.dso->tls_id > static_tls_cnt) {
 				struct td_index *new = malloc(sizeof *new);
 				if (!new) {
@@ -494,13 +539,13 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 					+ addend;
 #endif
 			}
-#ifdef TLSDESC_BACKWARDS
 			/* Some archs (32-bit ARM at least) invert the order of
 			 * the descriptor members. Fix them up here. */
-			size_t tmp = reloc_addr[0];
-			reloc_addr[0] = reloc_addr[1];
-			reloc_addr[1] = tmp;
-#endif
+			if (TLSDESC_BACKWARDS) {
+				size_t tmp = reloc_addr[0];
+				reloc_addr[0] = reloc_addr[1];
+				reloc_addr[1] = tmp;
+			}
 			break;
 		default:
 			error("Error relocating %s: unsupported relocation type %d",
@@ -511,6 +556,24 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
 	}
 }
 
+static void do_relr_relocs(struct dso *dso, size_t *relr, size_t relr_size)
+{
+	if (dso == &ldso) return; /* self-relocation was done in _dlstart */
+	unsigned char *base = dso->base;
+	size_t *reloc_addr;
+	for (; relr_size; relr++, relr_size-=sizeof(size_t))
+		if ((relr[0]&1) == 0) {
+			reloc_addr = laddr(dso, relr[0]);
+			*reloc_addr++ += (size_t)base;
+		} else {
+			int i = 0;
+			for (size_t bitmap=relr[0]; (bitmap>>=1); i++)
+				if (bitmap&1)
+					reloc_addr[i] += (size_t)base;
+			reloc_addr += 8*sizeof(size_t)-1;
+		}
+}
+
 static void redo_lazy_relocs()
 {
 	struct dso *p = lazy_head, *next;
@@ -553,16 +616,32 @@ static void reclaim_gaps(struct dso *dso)
 	for (; phcnt--; ph=(void *)((char *)ph+dso->phentsize)) {
 		if (ph->p_type!=PT_LOAD) continue;
 		if ((ph->p_flags&(PF_R|PF_W))!=(PF_R|PF_W)) continue;
+		if (ph->p_memsz == 0) continue;
 		reclaim(dso, ph->p_vaddr & -PAGE_SIZE, ph->p_vaddr);
 		reclaim(dso, ph->p_vaddr+ph->p_memsz,
 			ph->p_vaddr+ph->p_memsz+PAGE_SIZE-1 & -PAGE_SIZE);
 	}
 }
 
+static ssize_t read_loop(int fd, void *p, size_t n)
+{
+	for (size_t i=0; i<n; ) {
+		ssize_t l = read(fd, (char *)p+i, n-i);
+		if (l<0) {
+			if (errno==EINTR) continue;
+			else return -1;
+		}
+		if (l==0) return i;
+		i += l;
+	}
+	return n;
+}
+
 static void *mmap_fixed(void *p, size_t n, int prot, int flags, int fd, off_t off)
 {
 	static int no_map_fixed;
 	char *q;
+	if (!n) return p;
 	if (!no_map_fixed) {
 		q = mmap(p, n, prot, flags|MAP_FIXED, fd, off);
 		if (!DL_NOMMU_SUPPORT || q != MAP_FAILED || errno != EINVAL)
@@ -849,7 +928,7 @@ static int fixup_rpath(struct dso *p, char *buf, size_t buf_size)
 		case ENOENT:
 		case ENOTDIR:
 		case EACCES:
-			break;
+			return 0;
 		default:
 			return -1;
 		}
@@ -1063,13 +1142,17 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
 				snprintf(etc_ldso_path, sizeof etc_ldso_path,
 					"%.*s/etc/ld-musl-" LDSO_ARCH ".path",
 					(int)prefix_len, prefix);
-				FILE *f = fopen(etc_ldso_path, "rbe");
-				if (f) {
-					if (getdelim(&sys_path, (size_t[1]){0}, 0, f) <= 0) {
+				fd = open(etc_ldso_path, O_RDONLY|O_CLOEXEC);
+				if (fd>=0) {
+					size_t n = 0;
+					if (!fstat(fd, &st)) n = st.st_size;
+					if ((sys_path = malloc(n+1)))
+						sys_path[n] = 0;
+					if (!sys_path || read_loop(fd, sys_path, n)<0) {
 						free(sys_path);
 						sys_path = "";
 					}
-					fclose(f);
+					close(fd);
 				} else if (errno != ENOENT) {
 					sys_path = "";
 				}
@@ -1334,13 +1417,17 @@ static void reloc_all(struct dso *p)
 			2+(dyn[DT_PLTREL]==DT_RELA));
 		do_relocs(p, laddr(p, dyn[DT_REL]), dyn[DT_RELSZ], 2);
 		do_relocs(p, laddr(p, dyn[DT_RELA]), dyn[DT_RELASZ], 3);
-
-		if (head != &ldso && p->relro_start != p->relro_end &&
-		    mprotect(laddr(p, p->relro_start), p->relro_end-p->relro_start, PROT_READ)
-		    && errno != ENOSYS) {
-			error("Error relocating %s: RELRO protection failed: %m",
-				p->name);
-			if (runtime) longjmp(*rtld_fail, 1);
+		if (!DL_FDPIC)
+			do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
+
+		if (head != &ldso && p->relro_start != p->relro_end) {
+			long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
+				p->relro_end-p->relro_start, PROT_READ);
+			if (ret != 0 && ret != -ENOSYS) {
+				error("Error relocating %s: RELRO protection failed: %m",
+					p->name);
+				if (runtime) longjmp(*rtld_fail, 1);
+			}
 		}
 
 		p->relocated = 1;
@@ -1381,7 +1468,7 @@ void __libc_exit_fini()
 {
 	struct dso *p;
 	size_t dyn[DYN_CNT];
-	int self = __pthread_self()->tid;
+	pthread_t self = __pthread_self();
 
 	/* Take both locks before setting shutting_down, so that
 	 * either lock is sufficient to read its value. The lock
@@ -1407,6 +1494,17 @@ void __libc_exit_fini()
 	}
 }
 
+void __ldso_atfork(int who)
+{
+	if (who<0) {
+		pthread_rwlock_wrlock(&lock);
+		pthread_mutex_lock(&init_fini_lock);
+	} else {
+		pthread_mutex_unlock(&init_fini_lock);
+		pthread_rwlock_unlock(&lock);
+	}
+}
+
 static struct dso **queue_ctors(struct dso *dso)
 {
 	size_t cnt, qpos, spos, i;
@@ -1465,6 +1563,13 @@ static struct dso **queue_ctors(struct dso *dso)
 	}
 	queue[qpos] = 0;
 	for (i=0; i<qpos; i++) queue[i]->mark = 0;
+	for (i=0; i<qpos; i++)
+		if (queue[i]->ctor_visitor && queue[i]->ctor_visitor->tid < 0) {
+			error("State of %s is inconsistent due to multithreaded fork\n",
+				queue[i]->name);
+			free(queue);
+			if (runtime) longjmp(*rtld_fail, 1);
+		}
 
 	return queue;
 }
@@ -1473,7 +1578,7 @@ static void do_init_fini(struct dso **queue)
 {
 	struct dso *p;
 	size_t dyn[DYN_CNT], i;
-	int self = __pthread_self()->tid;
+	pthread_t self = __pthread_self();
 
 	pthread_mutex_lock(&init_fini_lock);
 	for (i=0; (p=queue[i]); i++) {
@@ -1582,7 +1687,7 @@ static void install_new_tls(void)
 
 	/* Install new dtv for each thread. */
 	for (j=0, td=self; !j || td!=self; j++, td=td->next) {
-		td->dtv = td->dtv_copy = newdtv[j];
+		td->dtv = newdtv[j];
 	}
 
 	__tl_unlock();
@@ -1620,11 +1725,12 @@ hidden void __dls2(unsigned char *base, size_t *sp)
 	} else {
 		ldso.base = base;
 	}
-	Ehdr *ehdr = (void *)ldso.base;
+	Ehdr *ehdr = __ehdr_start ? (void *)__ehdr_start : (void *)ldso.base;
 	ldso.name = ldso.shortname = "libc.so";
 	ldso.phnum = ehdr->e_phnum;
 	ldso.phdr = laddr(&ldso, ehdr->e_phoff);
 	ldso.phentsize = ehdr->e_phentsize;
+	search_vec(auxv, &ldso_page_size, AT_PAGESZ);
 	kernel_mapped_dso(&ldso);
 	decode_dyn(&ldso);
 
@@ -1717,6 +1823,9 @@ void __dls3(size_t *sp, size_t *auxv)
 		env_preload = getenv("LD_PRELOAD");
 	}
 
+	/* Activate error handler function */
+	error = error_impl;
+
 	/* If the main program was already loaded by the kernel,
 	 * AT_PHDR will point to some location other than the dynamic
 	 * linker's program headers. */
@@ -1792,7 +1901,7 @@ void __dls3(size_t *sp, size_t *auxv)
 			dprintf(2, "%s: cannot load %s: %s\n", ldname, argv[0], strerror(errno));
 			_exit(1);
 		}
-		Ehdr *ehdr = (void *)map_library(fd, &app);
+		Ehdr *ehdr = map_library(fd, &app);
 		if (!ehdr) {
 			dprintf(2, "%s: %s: Not a valid dynamic program\n", ldname, argv[0]);
 			_exit(1);
@@ -1884,6 +1993,10 @@ void __dls3(size_t *sp, size_t *auxv)
 			size_t *ptr = (size_t *) app.dynv[i+1];
 			*ptr = (size_t)&debug;
 		}
+		if (app.dynv[i]==DT_DEBUG_INDIRECT_REL) {
+			size_t *ptr = (size_t *)((size_t)&app.dynv[i] + app.dynv[i+1]);
+			*ptr = (size_t)&debug;
+		}
 	}
 
 	/* This must be done before final relocations, since it calls
@@ -1938,6 +2051,8 @@ void __dls3(size_t *sp, size_t *auxv)
 	 * possibility of incomplete replacement. */
 	if (find_sym(head, "malloc", 1).dso != &ldso)
 		__malloc_replaced = 1;
+	if (find_sym(head, "aligned_alloc", 1).dso != &ldso)
+		__aligned_alloc_replaced = 1;
 
 	/* Switch to runtime mode: any further failures in the dynamic
 	 * linker are a reportable failure rather than a fatal startup
@@ -1948,7 +2063,7 @@ void __dls3(size_t *sp, size_t *auxv)
 	debug.bp = dl_debug_state;
 	debug.head = head;
 	debug.base = ldso.base;
-	debug.state = 0;
+	debug.state = RT_CONSISTENT;
 	_dl_debug_state();
 
 	if (replace_argv0) argv[0] = replace_argv0;
@@ -1997,6 +2112,9 @@ void *dlopen(const char *file, int mode)
 	pthread_rwlock_wrlock(&lock);
 	__inhibit_ptc();
 
+	debug.state = RT_ADD;
+	_dl_debug_state();
+
 	p = 0;
 	if (shutting_down) {
 		error("Cannot dlopen while program is exiting.");
@@ -2056,8 +2174,9 @@ void *dlopen(const char *file, int mode)
 	load_deps(p);
 	extend_bfs_deps(p);
 	pthread_mutex_lock(&init_fini_lock);
-	if (!p->constructed) ctor_queue = queue_ctors(p);
+	int constructed = p->constructed;
 	pthread_mutex_unlock(&init_fini_lock);
+	if (!constructed) ctor_queue = queue_ctors(p);
 	if (!p->relocated && (mode & RTLD_LAZY)) {
 		prepare_lazy(p);
 		for (i=0; p->deps[i]; i++)
@@ -2089,9 +2208,10 @@ void *dlopen(const char *file, int mode)
 	update_tls_size();
 	if (tls_cnt != orig_tls_cnt)
 		install_new_tls();
-	_dl_debug_state();
 	orig_tail = tail;
 end:
+	debug.state = RT_CONSISTENT;
+	_dl_debug_state();
 	__release_ptc();
 	if (p) gencnt++;
 	pthread_rwlock_unlock(&lock);
@@ -2285,7 +2405,8 @@ int dl_iterate_phdr(int(*callback)(struct dl_phdr_info *info, size_t size, void
 		info.dlpi_adds      = gencnt;
 		info.dlpi_subs      = 0;
 		info.dlpi_tls_modid = current->tls_id;
-		info.dlpi_tls_data  = current->tls.image;
+		info.dlpi_tls_data = !current->tls_id ? 0 :
+			__tls_get_addr((tls_mod_off_t[]){current->tls_id,0});
 
 		ret = (callback)(&info, sizeof (info), data);
 
@@ -2298,7 +2419,7 @@ int dl_iterate_phdr(int(*callback)(struct dl_phdr_info *info, size_t size, void
 	return ret;
 }
 
-static void error(const char *fmt, ...)
+static void error_impl(const char *fmt, ...)
 {
 	va_list ap;
 	va_start(ap, fmt);
@@ -2312,3 +2433,7 @@ static void error(const char *fmt, ...)
 	__dl_vseterr(fmt, ap);
 	va_end(ap);
 }
+
+static void error_noop(const char *fmt, ...)
+{
+}
diff --git a/src/aio/aio.c b/src/aio/aio.c
index 6d34fa86..d7e063bf 100644
--- a/src/aio/aio.c
+++ b/src/aio/aio.c
@@ -9,6 +9,12 @@
 #include "syscall.h"
 #include "atomic.h"
 #include "pthread_impl.h"
+#include "aio_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
 
 /* The following is a threads-based implementation of AIO with minimal
  * dependence on implementation details. Most synchronization is
@@ -70,8 +76,14 @@ static struct aio_queue *****map;
 static volatile int aio_fd_cnt;
 volatile int __aio_fut;
 
+static size_t io_thread_stack_size;
+
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
 static struct aio_queue *__aio_get_queue(int fd, int need)
 {
+	sigset_t allmask, origmask;
+	int masked = 0;
 	if (fd < 0) {
 		errno = EBADF;
 		return 0;
@@ -83,7 +95,14 @@ static struct aio_queue *__aio_get_queue(int fd, int need)
 	if ((!map || !map[a] || !map[a][b] || !map[a][b][c] || !(q=map[a][b][c][d])) && need) {
 		pthread_rwlock_unlock(&maplock);
 		if (fcntl(fd, F_GETFD) < 0) return 0;
+		sigfillset(&allmask);
+		masked = 1;
+		pthread_sigmask(SIG_BLOCK, &allmask, &origmask);
 		pthread_rwlock_wrlock(&maplock);
+		if (!io_thread_stack_size) {
+			unsigned long val = __getauxval(AT_MINSIGSTKSZ);
+			io_thread_stack_size = MAX(MINSIGSTKSZ+2048, val+512);
+		}
 		if (!map) map = calloc(sizeof *map, (-1U/2+1)>>24);
 		if (!map) goto out;
 		if (!map[a]) map[a] = calloc(sizeof **map, 256);
@@ -105,6 +124,7 @@ static struct aio_queue *__aio_get_queue(int fd, int need)
 	if (q) pthread_mutex_lock(&q->lock);
 out:
 	pthread_rwlock_unlock(&maplock);
+	if (masked) pthread_sigmask(SIG_SETMASK, &origmask, 0);
 	return q;
 }
 
@@ -259,15 +279,6 @@ static void *io_thread_func(void *ctx)
 	return 0;
 }
 
-static size_t io_thread_stack_size = MINSIGSTKSZ+2048;
-static pthread_once_t init_stack_size_once;
-
-static void init_stack_size()
-{
-	unsigned long val = __getauxval(AT_MINSIGSTKSZ);
-	if (val > MINSIGSTKSZ) io_thread_stack_size = val + 512;
-}
-
 static int submit(struct aiocb *cb, int op)
 {
 	int ret = 0;
@@ -293,7 +304,6 @@ static int submit(struct aiocb *cb, int op)
 		else
 			pthread_attr_init(&a);
 	} else {
-		pthread_once(&init_stack_size_once, init_stack_size);
 		pthread_attr_init(&a);
 		pthread_attr_setstacksize(&a, io_thread_stack_size);
 		pthread_attr_setguardsize(&a, 0);
@@ -392,9 +402,31 @@ int __aio_close(int fd)
 	return fd;
 }
 
-weak_alias(aio_cancel, aio_cancel64);
-weak_alias(aio_error, aio_error64);
-weak_alias(aio_fsync, aio_fsync64);
-weak_alias(aio_read, aio_read64);
-weak_alias(aio_write, aio_write64);
-weak_alias(aio_return, aio_return64);
+void __aio_atfork(int who)
+{
+	if (who<0) {
+		pthread_rwlock_rdlock(&maplock);
+		return;
+	} else if (!who) {
+		pthread_rwlock_unlock(&maplock);
+		return;
+	}
+	aio_fd_cnt = 0;
+	if (pthread_rwlock_tryrdlock(&maplock)) {
+		/* Obtaining lock may fail if _Fork was called nor via
+		 * fork. In this case, no further aio is possible from
+		 * child and we can just null out map so __aio_close
+		 * does not attempt to do anything. */
+		map = 0;
+		return;
+	}
+	if (map) for (int a=0; a<(-1U/2+1)>>24; a++)
+		if (map[a]) for (int b=0; b<256; b++)
+			if (map[a][b]) for (int c=0; c<256; c++)
+				if (map[a][b][c]) for (int d=0; d<256; d++)
+					map[a][b][c][d] = 0;
+	/* Re-initialize the rwlock rather than unlocking since there
+	 * may have been more than one reference on it in the parent.
+	 * We are not a lock holder anyway; the thread in the parent was. */
+	pthread_rwlock_init(&maplock, 0);
+}
diff --git a/src/aio/aio_suspend.c b/src/aio/aio_suspend.c
index 34b66f87..1f0c9aaa 100644
--- a/src/aio/aio_suspend.c
+++ b/src/aio/aio_suspend.c
@@ -3,12 +3,13 @@
 #include <time.h>
 #include "atomic.h"
 #include "pthread_impl.h"
+#include "aio_impl.h"
 
 int aio_suspend(const struct aiocb *const cbs[], int cnt, const struct timespec *ts)
 {
 	int i, tid = 0, ret, expect = 0;
 	struct timespec at;
-	volatile int dummy_fut, *pfut;
+	volatile int dummy_fut = 0, *pfut;
 	int nzcnt = 0;
 	const struct aiocb *cb = 0;
 
@@ -72,7 +73,3 @@ int aio_suspend(const struct aiocb *const cbs[], int cnt, const struct timespec
 		}
 	}
 }
-
-#if !_REDIR_TIME64
-weak_alias(aio_suspend, aio_suspend64);
-#endif
diff --git a/src/aio/lio_listio.c b/src/aio/lio_listio.c
index 0799c15d..a672812f 100644
--- a/src/aio/lio_listio.c
+++ b/src/aio/lio_listio.c
@@ -139,5 +139,3 @@ int lio_listio(int mode, struct aiocb *restrict const *restrict cbs, int cnt, st
 
 	return 0;
 }
-
-weak_alias(lio_listio, lio_listio64);
diff --git a/src/complex/cacosf.c b/src/complex/cacosf.c
index 2e048540..ed8acf0f 100644
--- a/src/complex/cacosf.c
+++ b/src/complex/cacosf.c
@@ -2,8 +2,10 @@
 
 // FIXME
 
+static const float float_pi_2 = M_PI_2;
+
 float complex cacosf(float complex z)
 {
 	z = casinf(z);
-	return CMPLXF((float)M_PI_2 - crealf(z), -cimagf(z));
+	return CMPLXF(float_pi_2 - crealf(z), -cimagf(z));
 }
diff --git a/src/complex/cacosh.c b/src/complex/cacosh.c
index 76127f75..55b857ce 100644
--- a/src/complex/cacosh.c
+++ b/src/complex/cacosh.c
@@ -1,6 +1,6 @@
 #include "complex_impl.h"
 
-/* acosh(z) = i acos(z) */
+/* acosh(z) = ±i acos(z) */
 
 double complex cacosh(double complex z)
 {
diff --git a/src/complex/catan.c b/src/complex/catan.c
index ccc2fb53..b4fe552a 100644
--- a/src/complex/catan.c
+++ b/src/complex/catan.c
@@ -60,29 +60,6 @@
 
 #include "complex_impl.h"
 
-#define MAXNUM 1.0e308
-
-static const double DP1 = 3.14159265160560607910E0;
-static const double DP2 = 1.98418714791870343106E-9;
-static const double DP3 = 1.14423774522196636802E-17;
-
-static double _redupi(double x)
-{
-	double t;
-	long i;
-
-	t = x/M_PI;
-	if (t >= 0.0)
-		t += 0.5;
-	else
-		t -= 0.5;
-
-	i = t;  /* the multiple */
-	t = i;
-	t = ((x - t * DP1) - t * DP2) - t * DP3;
-	return t;
-}
-
 double complex catan(double complex z)
 {
 	double complex w;
@@ -95,7 +72,7 @@ double complex catan(double complex z)
 	a = 1.0 - x2 - (y * y);
 
 	t = 0.5 * atan2(2.0 * x, a);
-	w = _redupi(t);
+	w = t;
 
 	t = y - 1.0;
 	a = x2 + (t * t);
diff --git a/src/complex/catanf.c b/src/complex/catanf.c
index ef3907a5..faaa907a 100644
--- a/src/complex/catanf.c
+++ b/src/complex/catanf.c
@@ -55,30 +55,6 @@
 
 #include "complex_impl.h"
 
-#define MAXNUMF 1.0e38F
-
-static const double DP1 = 3.140625;
-static const double DP2 = 9.67502593994140625E-4;
-static const double DP3 = 1.509957990978376432E-7;
-
-static float _redupif(float xx)
-{
-	float x, t;
-	long i;
-
-	x = xx;
-	t = x/(float)M_PI;
-	if (t >= 0.0f)
-		t += 0.5f;
-	else
-		t -= 0.5f;
-
-	i = t;  /* the multiple */
-	t = i;
-	t = ((x - t * DP1) - t * DP2) - t * DP3;
-	return t;
-}
-
 float complex catanf(float complex z)
 {
 	float complex w;
@@ -91,7 +67,7 @@ float complex catanf(float complex z)
 	a = 1.0f - x2 - (y * y);
 
 	t = 0.5f * atan2f(2.0f * x, a);
-	w = _redupif(t);
+	w = t;
 
 	t = y - 1.0f;
 	a = x2 + (t * t);
diff --git a/src/complex/catanl.c b/src/complex/catanl.c
index e62526c0..cd2d2b00 100644
--- a/src/complex/catanl.c
+++ b/src/complex/catanl.c
@@ -67,28 +67,6 @@ long double complex catanl(long double complex z)
 	return catan(z);
 }
 #else
-static const long double PIL = 3.141592653589793238462643383279502884197169L;
-static const long double DP1 = 3.14159265358979323829596852490908531763125L;
-static const long double DP2 = 1.6667485837041756656403424829301998703007e-19L;
-static const long double DP3 = 1.8830410776607851167459095484560349402753e-39L;
-
-static long double redupil(long double x)
-{
-	long double t;
-	long i;
-
-	t = x / PIL;
-	if (t >= 0.0L)
-		t += 0.5L;
-	else
-		t -= 0.5L;
-
-	i = t;  /* the multiple */
-	t = i;
-	t = ((x - t * DP1) - t * DP2) - t * DP3;
-	return t;
-}
-
 long double complex catanl(long double complex z)
 {
 	long double complex w;
@@ -101,7 +79,7 @@ long double complex catanl(long double complex z)
 	a = 1.0L - x2 - (y * y);
 
 	t = atan2l(2.0L * x, a) * 0.5L;
-	w = redupil(t);
+	w = t;
 
 	t = y - 1.0L;
 	a = x2 + (t * t);
diff --git a/src/complex/cproj.c b/src/complex/cproj.c
index 9ae1e17c..d2b8f5a9 100644
--- a/src/complex/cproj.c
+++ b/src/complex/cproj.c
@@ -3,6 +3,6 @@
 double complex cproj(double complex z)
 {
 	if (isinf(creal(z)) || isinf(cimag(z)))
-		return CMPLX(INFINITY, copysign(0.0, creal(z)));
+		return CMPLX(INFINITY, copysign(0.0, cimag(z)));
 	return z;
 }
diff --git a/src/complex/cprojf.c b/src/complex/cprojf.c
index 03fab339..15a874bb 100644
--- a/src/complex/cprojf.c
+++ b/src/complex/cprojf.c
@@ -3,6 +3,6 @@
 float complex cprojf(float complex z)
 {
 	if (isinf(crealf(z)) || isinf(cimagf(z)))
-		return CMPLXF(INFINITY, copysignf(0.0, crealf(z)));
+		return CMPLXF(INFINITY, copysignf(0.0, cimagf(z)));
 	return z;
 }
diff --git a/src/complex/cprojl.c b/src/complex/cprojl.c
index 38a494c5..531ffa1c 100644
--- a/src/complex/cprojl.c
+++ b/src/complex/cprojl.c
@@ -9,7 +9,7 @@ long double complex cprojl(long double complex z)
 long double complex cprojl(long double complex z)
 {
 	if (isinf(creall(z)) || isinf(cimagl(z)))
-		return CMPLXL(INFINITY, copysignl(0.0, creall(z)));
+		return CMPLXL(INFINITY, copysignl(0.0, cimagl(z)));
 	return z;
 }
 #endif
diff --git a/src/conf/confstr.c b/src/conf/confstr.c
index 02cb1aa2..3d417284 100644
--- a/src/conf/confstr.c
+++ b/src/conf/confstr.c
@@ -7,7 +7,7 @@ size_t confstr(int name, char *buf, size_t len)
 	const char *s = "";
 	if (!name) {
 		s = "/bin:/usr/bin";
-	} else if ((name&~4U)!=1 && name-_CS_POSIX_V6_ILP32_OFF32_CFLAGS>33U) {
+	} else if ((name&~4U)!=1 && name-_CS_POSIX_V6_ILP32_OFF32_CFLAGS>35U) {
 		errno = EINVAL;
 		return 0;
 	}
diff --git a/src/conf/sysconf.c b/src/conf/sysconf.c
index 3baaed32..8dd5c725 100644
--- a/src/conf/sysconf.c
+++ b/src/conf/sysconf.c
@@ -4,6 +4,7 @@
 #include <sys/resource.h>
 #include <signal.h>
 #include <sys/sysinfo.h>
+#include <sys/auxv.h>
 #include "syscall.h"
 #include "libc.h"
 
@@ -19,6 +20,8 @@
 #define JT_AVPHYS_PAGES JT(9)
 #define JT_ZERO JT(10)
 #define JT_DELAYTIMER_MAX JT(11)
+#define JT_MINSIGSTKSZ JT(12)
+#define JT_SIGSTKSZ JT(13)
 
 #define RLIM(x) (-32768|(RLIMIT_ ## x))
 
@@ -165,6 +168,9 @@ long sysconf(int name)
 		[_SC_XOPEN_STREAMS] = JT_ZERO,
 		[_SC_THREAD_ROBUST_PRIO_INHERIT] = -1,
 		[_SC_THREAD_ROBUST_PRIO_PROTECT] = -1,
+
+		[_SC_MINSIGSTKSZ] = JT_MINSIGSTKSZ,
+		[_SC_SIGSTKSZ] = JT_SIGSTKSZ,
 	};
 
 	if (name >= sizeof(values)/sizeof(values[0]) || !values[name]) {
@@ -212,6 +218,18 @@ long sysconf(int name)
 		mem *= si.mem_unit;
 		mem /= PAGE_SIZE;
 		return (mem > LONG_MAX) ? LONG_MAX : mem;
+	case JT_MINSIGSTKSZ & 255:
+	case JT_SIGSTKSZ & 255: ;
+		/* Value from auxv/kernel is only sigfame size. Clamp it
+		 * to at least 1k below arch's traditional MINSIGSTKSZ,
+		 * then add 1k of working space for signal handler. */
+		unsigned long sigframe_sz = __getauxval(AT_MINSIGSTKSZ);
+		if (sigframe_sz < MINSIGSTKSZ - 1024)
+			sigframe_sz = MINSIGSTKSZ - 1024;
+		unsigned val = sigframe_sz + 1024;
+		if (values[name] == JT_SIGSTKSZ)
+			val += SIGSTKSZ - MINSIGSTKSZ;
+		return val;
 	case JT_ZERO & 255:
 		return 0;
 	}
diff --git a/src/crypt/crypt_blowfish.c b/src/crypt/crypt_blowfish.c
index d3f79851..d722607b 100644
--- a/src/crypt/crypt_blowfish.c
+++ b/src/crypt/crypt_blowfish.c
@@ -15,7 +15,7 @@
  * No copyright is claimed, and the software is hereby placed in the public
  * domain.  In case this attempt to disclaim copyright and place the software
  * in the public domain is deemed null and void, then the software is
- * Copyright (c) 1998-2012 Solar Designer and it is hereby released to the
+ * Copyright (c) 1998-2014 Solar Designer and it is hereby released to the
  * general public under the following terms:
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,12 @@
  * you place this code and any modifications you make under a license
  * of your choice.
  *
- * This implementation is mostly compatible with OpenBSD's bcrypt.c (prefix
- * "$2a$") by Niels Provos <provos at citi.umich.edu>, and uses some of his
- * ideas.  The password hashing algorithm was designed by David Mazieres
- * <dm at lcs.mit.edu>.  For more information on the level of compatibility,
- * please refer to the comments in BF_set_key() below and to the included
- * crypt(3) man page.
+ * This implementation is fully compatible with OpenBSD's bcrypt.c for prefix
+ * "$2b$", originally by Niels Provos <provos at citi.umich.edu>, and it uses
+ * some of his ideas.  The password hashing algorithm was designed by David
+ * Mazieres <dm at lcs.mit.edu>.  For information on the level of
+ * compatibility for bcrypt hash prefixes other than "$2b$", please refer to
+ * the comments in BF_set_key() below and to the included crypt(3) man page.
  *
  * There's a paper on the algorithm that explains its design decisions:
  *
@@ -533,6 +533,7 @@ static void BF_set_key(const char *key, BF_key expanded, BF_key initial,
  * Valid combinations of settings are:
  *
  * Prefix "$2a$": bug = 0, safety = 0x10000
+ * Prefix "$2b$": bug = 0, safety = 0
  * Prefix "$2x$": bug = 1, safety = 0
  * Prefix "$2y$": bug = 0, safety = 0
  */
@@ -596,12 +597,14 @@ static void BF_set_key(const char *key, BF_key expanded, BF_key initial,
 	initial[0] ^= sign;
 }
 
+static const unsigned char flags_by_subtype[26] = {
+	2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0
+};
+
 static char *BF_crypt(const char *key, const char *setting,
 	char *output, BF_word min)
 {
-	static const unsigned char flags_by_subtype[26] =
-		{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0};
 	struct {
 		BF_ctx ctx;
 		BF_key expanded_key;
@@ -746,9 +749,11 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
 {
 	const char *test_key = "8b \xd0\xc1\xd2\xcf\xcc\xd8";
 	const char *test_setting = "$2a$00$abcdefghijklmnopqrstuu";
-	static const char test_hash[2][34] =
-		{"VUrPmXD6q/nVSSp7pNDhCR9071IfIRe\0\x55", /* $2x$ */
-		"i1D709vfamulimlGcq0qq3UvuUasvEa\0\x55"}; /* $2a$, $2y$ */
+	static const char test_hashes[2][34] = {
+		"i1D709vfamulimlGcq0qq3UvuUasvEa\0\x55", /* 'a', 'b', 'y' */
+		"VUrPmXD6q/nVSSp7pNDhCR9071IfIRe\0\x55", /* 'x' */
+	};
+	const char *test_hash = test_hashes[0];
 	char *retval;
 	const char *p;
 	int ok;
@@ -768,8 +773,11 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
  * detected by the self-test.
  */
 	memcpy(buf.s, test_setting, sizeof(buf.s));
-	if (retval)
+	if (retval) {
+		unsigned int flags = flags_by_subtype[setting[2] - 'a'];
+		test_hash = test_hashes[flags & 1];
 		buf.s[2] = setting[2];
+	}
 	memset(buf.o, 0x55, sizeof(buf.o));
 	buf.o[sizeof(buf.o) - 1] = 0;
 	p = BF_crypt(test_key, buf.s, buf.o, 1);
@@ -777,7 +785,7 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
 	ok = (p == buf.o &&
 	    !memcmp(p, buf.s, 7 + 22) &&
 	    !memcmp(p + (7 + 22),
-	    test_hash[buf.s[2] & 1],
+	    test_hash,
 	    31 + 1 + 1 + 1));
 
 	{
diff --git a/src/ctype/nonspacing.h b/src/ctype/nonspacing.h
index 5d05a3d1..7746f3b6 100644
--- a/src/ctype/nonspacing.h
+++ b/src/ctype/nonspacing.h
@@ -1,23 +1,23 @@
-16,16,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,16,32,16,16,16,33,34,35,
-36,37,38,39,16,16,40,16,16,16,16,16,16,16,16,16,16,16,41,42,16,16,43,16,16,16,
+16,16,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,16,33,16,16,16,34,35,36,
+37,38,39,40,16,16,41,16,16,16,16,16,16,16,16,16,16,16,42,43,16,16,44,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,44,16,45,46,47,48,16,16,16,16,16,16,16,16,16,16,
+16,16,16,16,16,16,16,16,16,16,45,16,46,47,48,49,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,16,16,16,16,16,16,50,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,51,16,16,52,
+53,16,54,55,56,16,16,16,16,16,16,57,16,16,58,16,59,60,61,62,63,64,65,66,67,68,
+69,70,16,71,72,73,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,74,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,49,16,16,50,
-51,16,52,53,54,16,16,16,16,16,16,55,16,16,56,16,57,58,59,60,61,62,63,64,65,66,
-67,68,16,69,70,71,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,72,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,16,16,75,76,16,16,16,77,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,73,74,16,16,16,75,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,16,16,16,16,16,76,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
-16,16,77,78,16,16,16,16,16,16,16,79,16,16,16,16,16,80,81,82,16,16,16,16,16,83,
-84,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,16,16,16,16,16,16,78,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+16,16,79,80,16,16,16,16,16,16,16,81,16,16,16,16,16,82,83,84,16,16,16,16,16,85,
+86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
 16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
@@ -35,55 +35,57 @@
 242,7,128,127,0,0,0,0,0,0,0,0,0,0,0,0,242,31,0,63,0,0,0,0,0,0,0,0,0,3,0,0,160,
 2,0,0,0,0,0,0,254,127,223,224,255,254,255,255,255,31,64,0,0,0,0,0,0,0,0,0,0,0,
 0,224,253,102,0,0,0,195,1,0,30,0,100,32,0,32,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,
-0,0,28,0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,176,63,64,254,15,32,0,0,0,0,0,120,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,135,1,4,14,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,9,0,0,0,0,0,0,64,127,
-229,31,248,159,0,0,0,0,0,0,255,127,0,0,0,0,0,0,0,0,15,0,0,0,0,0,208,23,4,0,0,
-0,0,248,15,0,3,0,0,0,60,59,0,0,0,0,0,0,64,163,3,0,0,0,0,0,0,240,207,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,247,255,253,33,16,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,
+0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,224,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,28,0,0,0,28,0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,176,63,64,254,
+15,32,0,0,0,0,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,2,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,135,1,4,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+128,9,0,0,0,0,0,0,64,127,229,31,248,159,0,0,0,0,0,0,255,127,0,0,0,0,0,0,0,0,
+15,0,0,0,0,0,208,23,4,0,0,0,0,248,15,0,3,0,0,0,60,59,0,0,0,0,0,0,64,163,3,0,0,
+0,0,0,0,240,207,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,247,255,253,33,16,
+3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,
 251,0,248,0,0,0,124,0,0,0,0,0,0,223,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,
 255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0,
 0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,128,247,63,0,0,0,192,0,0,0,0,0,0,0,0,0,0,3,0,68,8,0,0,96,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0,255,255,3,128,0,0,0,0,192,63,0,0,128,255,3,0,
-0,0,0,0,7,0,0,0,0,0,200,51,0,0,0,0,32,0,0,0,0,0,0,0,0,126,102,0,8,16,0,0,0,0,
-0,16,0,0,0,0,0,0,157,193,2,0,0,0,0,48,64,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,33,0,0,0,0,0,64,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,110,240,0,0,0,0,0,135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,
-0,0,0,0,0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,192,255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,255,
-127,0,0,0,0,0,0,128,3,0,0,0,0,0,120,38,0,32,0,0,0,0,0,0,7,0,0,0,128,239,31,0,
-0,0,0,0,0,0,8,0,3,0,0,0,0,0,192,127,0,30,0,0,0,0,0,0,0,0,0,0,0,128,211,64,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,248,7,0,0,3,0,0,0,0,0,0,24,1,0,0,0,192,
-31,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,92,0,0,64,0,0,0,0,0,
-0,0,0,0,0,248,133,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,60,176,1,0,0,48,0,0,0,
-0,0,0,0,0,0,0,248,167,1,0,0,0,0,0,0,0,0,0,0,0,0,40,191,0,0,0,0,0,0,0,0,0,0,0,
-0,224,188,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-128,255,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,12,1,0,0,0,254,7,0,0,0,0,248,121,128,0,
-126,14,0,0,0,0,0,252,127,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,191,0,0,0,
-0,0,0,0,0,0,0,252,255,255,252,109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,126,180,191,0,
-0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,
-0,0,0,0,0,0,0,255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0,0,0,0,127,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,
-0,128,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,15,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,248,255,231,15,0,0,0,60,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,255,255,255,255,255,255,127,248,255,255,255,255,255,31,32,0,16,0,0,248,
-254,255,0,0,0,0,0,0,0,0,0,
-0,127,255,255,249,219,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,240,7,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,7,0,0,0,0,0,200,51,0,0,0,0,32,0,0,
+0,0,0,0,0,0,126,102,0,8,16,0,0,0,0,0,16,0,0,0,0,0,0,157,193,2,0,0,0,0,48,64,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,33,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,0,0,0,
+64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,
+255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,1,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,110,240,0,
+0,0,0,0,135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,0,0,0,240,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,255,1,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,255,127,0,0,0,0,0,0,128,
+3,0,0,0,0,0,120,38,0,32,0,0,0,0,0,0,7,0,0,0,128,239,31,0,0,0,0,0,0,0,8,0,3,0,
+0,0,0,0,192,127,0,30,0,0,0,0,0,0,0,0,0,0,0,128,211,64,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,128,248,7,0,0,3,0,0,0,0,0,0,24,1,0,0,0,192,31,31,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,255,92,0,0,64,0,0,0,0,0,0,0,0,0,0,248,133,13,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60,176,1,0,0,48,0,0,0,0,0,0,0,0,0,0,
+248,167,1,0,0,0,0,0,0,0,0,0,0,0,0,40,191,0,0,0,0,0,0,0,0,0,0,0,0,224,188,15,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,255,6,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,240,12,1,0,0,0,254,7,0,0,0,0,248,121,128,0,126,14,0,0,0,0,0,252,
+127,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,191,0,0,0,0,0,0,0,0,0,0,252,255,
+255,252,109,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,126,180,191,0,0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,255,
+1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,7,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,15,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,248,255,231,15,0,0,0,60,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,
+255,255,255,255,127,248,255,255,255,255,255,31,32,0,16,0,0,248,254,255,0,0,0,
+0,0,0,0,0,0,0,127,255,255,249,219,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,240,7,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
diff --git a/src/dirent/alphasort.c b/src/dirent/alphasort.c
index bee672eb..ab2624e2 100644
--- a/src/dirent/alphasort.c
+++ b/src/dirent/alphasort.c
@@ -5,5 +5,3 @@ int alphasort(const struct dirent **a, const struct dirent **b)
 {
 	return strcoll((*a)->d_name, (*b)->d_name);
 }
-
-weak_alias(alphasort, alphasort64);
diff --git a/src/dirent/posix_getdents.c b/src/dirent/posix_getdents.c
new file mode 100644
index 00000000..26c16ac6
--- /dev/null
+++ b/src/dirent/posix_getdents.c
@@ -0,0 +1,11 @@
+#include <dirent.h>
+#include <limits.h>
+#include <errno.h>
+#include "syscall.h"
+
+ssize_t posix_getdents(int fd, void *buf, size_t len, int flags)
+{
+	if (flags) return __syscall_ret(-EOPNOTSUPP);
+	if (len>INT_MAX) len = INT_MAX;
+	return syscall(SYS_getdents, fd, buf, len);
+}
diff --git a/src/dirent/readdir.c b/src/dirent/readdir.c
index 569fc705..5a03b363 100644
--- a/src/dirent/readdir.c
+++ b/src/dirent/readdir.c
@@ -25,5 +25,3 @@ struct dirent *readdir(DIR *dir)
 	dir->tell = de->d_off;
 	return de;
 }
-
-weak_alias(readdir, readdir64);
diff --git a/src/dirent/readdir_r.c b/src/dirent/readdir_r.c
index e2a818f3..0d5de5f5 100644
--- a/src/dirent/readdir_r.c
+++ b/src/dirent/readdir_r.c
@@ -25,5 +25,3 @@ int readdir_r(DIR *restrict dir, struct dirent *restrict buf, struct dirent **re
 	*result = buf;
 	return 0;
 }
-
-weak_alias(readdir_r, readdir64_r);
diff --git a/src/dirent/scandir.c b/src/dirent/scandir.c
index 7ee195dd..7456b9b8 100644
--- a/src/dirent/scandir.c
+++ b/src/dirent/scandir.c
@@ -43,5 +43,3 @@ int scandir(const char *path, struct dirent ***res,
 	*res = names;
 	return cnt;
 }
-
-weak_alias(scandir, scandir64);
diff --git a/src/dirent/versionsort.c b/src/dirent/versionsort.c
index d4c48923..97696105 100644
--- a/src/dirent/versionsort.c
+++ b/src/dirent/versionsort.c
@@ -6,6 +6,3 @@ int versionsort(const struct dirent **a, const struct dirent **b)
 {
 	return strverscmp((*a)->d_name, (*b)->d_name);
 }
-
-#undef versionsort64
-weak_alias(versionsort, versionsort64);
diff --git a/src/env/__init_tls.c b/src/env/__init_tls.c
index 772baba3..a93141ed 100644
--- a/src/env/__init_tls.c
+++ b/src/env/__init_tls.c
@@ -67,7 +67,7 @@ void *__copy_tls(unsigned char *mem)
 	}
 #endif
 	dtv[0] = libc.tls_cnt;
-	td->dtv = td->dtv_copy = dtv;
+	td->dtv = dtv;
 	return td;
 }
 
diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index 8fbe5262..c5b277bd 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -69,7 +69,8 @@ weak_alias(libc_start_init, __libc_start_init);
 typedef int lsm2_fn(int (*)(int,char **,char **), int, char **);
 static lsm2_fn libc_start_main_stage2;
 
-int __libc_start_main(int (*main)(int,char **,char **), int argc, char **argv)
+int __libc_start_main(int (*main)(int,char **,char **), int argc, char **argv,
+	void (*init_dummy)(), void(*fini_dummy)(), void(*ldso_dummy)())
 {
 	char **envp = argv+argc+1;
 
diff --git a/src/env/__stack_chk_fail.c b/src/env/__stack_chk_fail.c
index e32596d1..e5352602 100644
--- a/src/env/__stack_chk_fail.c
+++ b/src/env/__stack_chk_fail.c
@@ -9,7 +9,16 @@ void __init_ssp(void *entropy)
 	if (entropy) memcpy(&__stack_chk_guard, entropy, sizeof(uintptr_t));
 	else __stack_chk_guard = (uintptr_t)&__stack_chk_guard * 1103515245;
 
-	__pthread_self()->CANARY = __stack_chk_guard;
+#if UINTPTR_MAX >= 0xffffffffffffffff
+	/* Sacrifice 8 bits of entropy on 64bit to prevent leaking/
+	 * overwriting the canary via string-manipulation functions.
+	 * The NULL byte is on the second byte so that off-by-ones can
+	 * still be detected. Endianness is taken care of
+	 * automatically. */
+	((char *)&__stack_chk_guard)[1] = 0;
+#endif
+
+	__pthread_self()->canary = __stack_chk_guard;
 }
 
 void __stack_chk_fail(void)
diff --git a/src/errno/__strerror.h b/src/errno/__strerror.h
index 2f04d400..0398b5b6 100644
--- a/src/errno/__strerror.h
+++ b/src/errno/__strerror.h
@@ -1,8 +1,9 @@
-/* This file is sorted such that 'errors' which represent exceptional
- * conditions under which a correct program may fail come first, followed
- * by messages that indicate an incorrect program or system failure. The
- * macro E() along with double-inclusion is used to ensure that ordering
- * of the strings remains synchronized. */
+/* The first entry is a catch-all for codes not enumerated here.
+ * This file is included multiple times to declare and define a structure
+ * with these messages, and then to define a lookup table translating
+ * error codes to offsets of corresponding fields in the structure. */
+
+E(0,            "No error information")
 
 E(EILSEQ,       "Illegal byte sequence")
 E(EDOM,         "Domain error")
@@ -96,10 +97,14 @@ E(ESHUTDOWN,    "Cannot send after socket shutdown")
 E(EALREADY,     "Operation already in progress")
 E(EINPROGRESS,  "Operation in progress")
 E(ESTALE,       "Stale file handle")
+E(EUCLEAN,      "Data consistency error")
+E(ENAVAIL,      "Resource not available")
 E(EREMOTEIO,    "Remote I/O error")
 E(EDQUOT,       "Quota exceeded")
 E(ENOMEDIUM,    "No medium found")
 E(EMEDIUMTYPE,  "Wrong medium type")
 E(EMULTIHOP,    "Multihop attempted")
-
-E(0,            "No error information")
+E(ENOKEY,       "Required key not available")
+E(EKEYEXPIRED,  "Key has expired")
+E(EKEYREVOKED,  "Key has been revoked")
+E(EKEYREJECTED, "Key was rejected by service")
diff --git a/src/errno/strerror.c b/src/errno/strerror.c
index e3ed771a..7f926432 100644
--- a/src/errno/strerror.c
+++ b/src/errno/strerror.c
@@ -1,30 +1,41 @@
 #include <errno.h>
+#include <stddef.h>
 #include <string.h>
 #include "locale_impl.h"
 
-#define E(a,b) ((unsigned char)a),
-static const unsigned char errid[] = {
+/* mips has one error code outside of the 8-bit range due to a
+ * historical typo, so we just remap it. */
+#if EDQUOT==1133
+#define EDQUOT_ORIG 1133
+#undef  EDQUOT
+#define EDQUOT 109
+#endif
+
+static const struct errmsgstr_t {
+#define E(n, s) char str##n[sizeof(s)];
+#include "__strerror.h"
+#undef E
+} errmsgstr = {
+#define E(n, s) s,
 #include "__strerror.h"
+#undef E
 };
 
-#undef E
-#define E(a,b) b "\0"
-static const char errmsg[] =
+static const unsigned short errmsgidx[] = {
+#define E(n, s) [n] = offsetof(struct errmsgstr_t, str##n),
 #include "__strerror.h"
-;
+#undef E
+};
 
 char *__strerror_l(int e, locale_t loc)
 {
 	const char *s;
-	int i;
-	/* mips has one error code outside of the 8-bit range due to a
-	 * historical typo, so we just remap it. */
-	if (EDQUOT==1133) {
-		if (e==109) e=-1;
-		else if (e==EDQUOT) e=109;
-	}
-	for (i=0; errid[i] && errid[i] != e; i++);
-	for (s=errmsg; i; s++, i--) for (; *s; s++);
+#ifdef EDQUOT_ORIG
+	if (e==EDQUOT) e=0;
+	else if (e==EDQUOT_ORIG) e=EDQUOT;
+#endif
+	if (e >= sizeof errmsgidx / sizeof *errmsgidx) e = 0;
+	s = (char *)&errmsgstr + errmsgidx[e];
 	return (char *)LCTRANS(s, LC_MESSAGES, loc);
 }
 
diff --git a/src/exit/abort.c b/src/exit/abort.c
index e1980f10..f21f458e 100644
--- a/src/exit/abort.c
+++ b/src/exit/abort.c
@@ -6,8 +6,6 @@
 #include "lock.h"
 #include "ksigaction.h"
 
-hidden volatile int __abort_lock[1];
-
 _Noreturn void abort(void)
 {
 	raise(SIGABRT);
diff --git a/src/exit/abort_lock.c b/src/exit/abort_lock.c
new file mode 100644
index 00000000..3af72c7b
--- /dev/null
+++ b/src/exit/abort_lock.c
@@ -0,0 +1,3 @@
+#include "pthread_impl.h"
+
+volatile int __abort_lock[1];
diff --git a/src/exit/assert.c b/src/exit/assert.c
index 49b0dc3e..94edd827 100644
--- a/src/exit/assert.c
+++ b/src/exit/assert.c
@@ -4,6 +4,5 @@
 _Noreturn void __assert_fail(const char *expr, const char *file, int line, const char *func)
 {
 	fprintf(stderr, "Assertion failed: %s (%s: %s: %d)\n", expr, file, func, line);
-	fflush(NULL);
 	abort();
 }
diff --git a/src/exit/at_quick_exit.c b/src/exit/at_quick_exit.c
index d3ce6522..e4b5d78d 100644
--- a/src/exit/at_quick_exit.c
+++ b/src/exit/at_quick_exit.c
@@ -1,12 +1,14 @@
 #include <stdlib.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
 
 #define COUNT 32
 
 static void (*funcs[COUNT])(void);
 static int count;
 static volatile int lock[1];
+volatile int *const __at_quick_exit_lockptr = lock;
 
 void __funcs_on_quick_exit()
 {
diff --git a/src/exit/atexit.c b/src/exit/atexit.c
index 160d277a..92c91c9d 100644
--- a/src/exit/atexit.c
+++ b/src/exit/atexit.c
@@ -2,6 +2,12 @@
 #include <stdint.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 /* Ensure that at least 32 atexit handlers can be registered without malloc */
 #define COUNT 32
@@ -13,8 +19,10 @@ static struct fl
 	void *a[COUNT];
 } builtin, *head;
 
+static int finished_atexit;
 static int slot;
 static volatile int lock[1];
+volatile int *const __atexit_lockptr = lock;
 
 void __funcs_on_exit()
 {
@@ -27,6 +35,10 @@ void __funcs_on_exit()
 		func(arg);
 		LOCK(lock);
 	}
+	/* Unlock to prevent deadlock if a global dtor
+	 * attempts to call atexit. */
+	finished_atexit = 1;
+	UNLOCK(lock);
 }
 
 void __cxa_finalize(void *dso)
@@ -37,6 +49,13 @@ int __cxa_atexit(void (*func)(void *), void *arg, void *dso)
 {
 	LOCK(lock);
 
+	/* Prevent dtors from registering further atexit
+	 * handlers that would never be run. */
+	if (finished_atexit) {
+		UNLOCK(lock);
+		return -1;
+	}
+
 	/* Defer initialization of head so it can be in BSS */
 	if (!head) head = &builtin;
 
diff --git a/src/exit/exit.c b/src/exit/exit.c
index a6869b37..17b33cc6 100644
--- a/src/exit/exit.c
+++ b/src/exit/exit.c
@@ -1,6 +1,9 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include "libc.h"
+#include "pthread_impl.h"
+#include "atomic.h"
+#include "syscall.h"
 
 static void dummy()
 {
@@ -26,6 +29,17 @@ weak_alias(libc_exit_fini, __libc_exit_fini);
 
 _Noreturn void exit(int code)
 {
+	/* Handle potentially concurrent or recursive calls to exit,
+	 * whose behaviors have traditionally been undefined by the
+	 * standards. Using a custom lock here avoids pulling in lock
+	 * machinery and lets us trap recursive calls while supporting
+	 * multiple threads contending to be the one to exit(). */
+	static volatile int exit_lock[1];
+	int tid =  __pthread_self()->tid;
+	int prev = a_cas(exit_lock, 0, tid);
+	if (prev == tid) a_crash();
+	else if (prev) for (;;) __sys_pause();
+
 	__funcs_on_exit();
 	__libc_exit_fini();
 	__stdio_exit();
diff --git a/src/fcntl/creat.c b/src/fcntl/creat.c
index 8f8aab64..c9c43910 100644
--- a/src/fcntl/creat.c
+++ b/src/fcntl/creat.c
@@ -4,5 +4,3 @@ int creat(const char *filename, mode_t mode)
 {
 	return open(filename, O_CREAT|O_WRONLY|O_TRUNC, mode);
 }
-
-weak_alias(creat, creat64);
diff --git a/src/fcntl/open.c b/src/fcntl/open.c
index 1d817a2d..4c3c8275 100644
--- a/src/fcntl/open.c
+++ b/src/fcntl/open.c
@@ -19,5 +19,3 @@ int open(const char *filename, int flags, ...)
 
 	return __syscall_ret(fd);
 }
-
-weak_alias(open, open64);
diff --git a/src/fcntl/openat.c b/src/fcntl/openat.c
index ad165ec3..83a9e0d0 100644
--- a/src/fcntl/openat.c
+++ b/src/fcntl/openat.c
@@ -15,5 +15,3 @@ int openat(int fd, const char *filename, int flags, ...)
 
 	return syscall_cp(SYS_openat, fd, filename, flags|O_LARGEFILE, mode);
 }
-
-weak_alias(openat, openat64);
diff --git a/src/fcntl/posix_fadvise.c b/src/fcntl/posix_fadvise.c
index 75b8e1ae..07346d21 100644
--- a/src/fcntl/posix_fadvise.c
+++ b/src/fcntl/posix_fadvise.c
@@ -14,5 +14,3 @@ int posix_fadvise(int fd, off_t base, off_t len, int advice)
 		__SYSCALL_LL_E(len), advice);
 #endif
 }
-
-weak_alias(posix_fadvise, posix_fadvise64);
diff --git a/src/fcntl/posix_fallocate.c b/src/fcntl/posix_fallocate.c
index c57a24ae..80a65cbf 100644
--- a/src/fcntl/posix_fallocate.c
+++ b/src/fcntl/posix_fallocate.c
@@ -6,5 +6,3 @@ int posix_fallocate(int fd, off_t base, off_t len)
 	return -__syscall(SYS_fallocate, fd, 0, __SYSCALL_LL_E(base),
 		__SYSCALL_LL_E(len));
 }
-
-weak_alias(posix_fallocate, posix_fallocate64);
diff --git a/src/fenv/loongarch64/fenv.S b/src/fenv/loongarch64/fenv.S
new file mode 100644
index 00000000..9c38599e
--- /dev/null
+++ b/src/fenv/loongarch64/fenv.S
@@ -0,0 +1,78 @@
+#ifndef __loongarch_soft_float
+
+#ifdef BROKEN_LOONGARCH_FCSR_ASM
+#define FCSR $r0
+#else
+#define FCSR $fcsr0
+#endif
+
+.global feclearexcept
+.type   feclearexcept,@function
+feclearexcept:
+	li.w    $t0, 0x1f0000
+	and     $a0, $a0, $t0
+	movfcsr2gr $t1, FCSR
+	andn    $t1, $t1, $a0
+	movgr2fcsr FCSR, $t1
+	li.w    $a0, 0
+	jr      $ra
+
+.global feraiseexcept
+.type   feraiseexcept,@function
+feraiseexcept:
+	li.w    $t0, 0x1f0000
+	and     $a0, $a0, $t0
+	movfcsr2gr $t1, FCSR
+	or      $t1, $t1, $a0
+	movgr2fcsr FCSR, $t1
+	li.w    $a0, 0
+	jr      $ra
+
+.global fetestexcept
+.type   fetestexcept,@function
+fetestexcept:
+	li.w    $t0, 0x1f0000
+	and     $a0, $a0, $t0
+	movfcsr2gr $t1, FCSR
+	and     $a0, $t1, $a0
+	jr      $ra
+
+.global fegetround
+.type   fegetround,@function
+fegetround:
+	movfcsr2gr $t0, FCSR
+	andi    $a0, $t0, 0x300
+	jr      $ra
+
+.global __fesetround
+.hidden __fesetround
+.type   __fesetround,@function
+__fesetround:
+	li.w    $t0, 0x300
+	and     $a0, $a0, $t0
+	movfcsr2gr $t1, FCSR
+	andn    $t1, $t1, $t0
+	or      $t1, $t1, $a0
+	movgr2fcsr FCSR, $t1
+	li.w    $a0, 0
+	jr      $ra
+
+.global fegetenv
+.type   fegetenv,@function
+fegetenv:
+	movfcsr2gr $t0, FCSR
+	st.w    $t0, $a0, 0
+	li.w    $a0, 0
+	jr      $ra
+
+.global fesetenv
+.type   fesetenv,@function
+fesetenv:
+	addi.d  $t0, $a0, 1
+	beq     $t0, $r0, 1f
+	ld.w    $t0, $a0, 0
+1:	movgr2fcsr FCSR, $t0
+	li.w    $a0, 0
+	jr      $ra
+
+#endif
diff --git a/src/fenv/powerpc/fenv-sf.c b/src/fenv/powerpc/fenv-sf.c
index 85bef40f..d4248f26 100644
--- a/src/fenv/powerpc/fenv-sf.c
+++ b/src/fenv/powerpc/fenv-sf.c
@@ -1,3 +1,3 @@
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 #include "../fenv.c"
 #endif
diff --git a/src/fenv/powerpc/fenv.S b/src/fenv/powerpc/fenv.S
index 22cea216..55055d0b 100644
--- a/src/fenv/powerpc/fenv.S
+++ b/src/fenv/powerpc/fenv.S
@@ -1,4 +1,4 @@
-#ifndef _SOFT_FLOAT
+#if !defined(_SOFT_FLOAT) && !defined(__NO_FPRS__)
 .global feclearexcept
 .type feclearexcept,@function
 feclearexcept:
diff --git a/src/fenv/riscv32/fenv-sf.c b/src/fenv/riscv32/fenv-sf.c
new file mode 100644
index 00000000..ecd3cb5c
--- /dev/null
+++ b/src/fenv/riscv32/fenv-sf.c
@@ -0,0 +1,3 @@
+#ifndef __riscv_flen
+#include "../fenv.c"
+#endif
diff --git a/src/fenv/riscv32/fenv.S b/src/fenv/riscv32/fenv.S
new file mode 100644
index 00000000..0ea78bf9
--- /dev/null
+++ b/src/fenv/riscv32/fenv.S
@@ -0,0 +1,56 @@
+#ifdef __riscv_flen
+
+.global feclearexcept
+.type feclearexcept, %function
+feclearexcept:
+	csrc fflags, a0
+	li a0, 0
+	ret
+
+.global feraiseexcept
+.type feraiseexcept, %function
+feraiseexcept:
+	csrs fflags, a0
+	li a0, 0
+	ret
+
+.global fetestexcept
+.type fetestexcept, %function
+fetestexcept:
+	frflags t0
+	and a0, t0, a0
+	ret
+
+.global fegetround
+.type fegetround, %function
+fegetround:
+	frrm a0
+	ret
+
+.global __fesetround
+.type __fesetround, %function
+__fesetround:
+	fsrm t0, a0
+	li a0, 0
+	ret
+
+.global fegetenv
+.type fegetenv, %function
+fegetenv:
+	frcsr t0
+	sw t0, 0(a0)
+	li a0, 0
+	ret
+
+.global fesetenv
+.type fesetenv, %function
+fesetenv:
+	li t2, -1
+	li t1, 0
+	beq a0, t2, 1f
+	lw t1, 0(a0)
+1:	fscsr t1
+	li a0, 0
+	ret
+
+#endif
diff --git a/src/fenv/sh/fenv.S b/src/fenv/sh/fenv.S
index 907aefc0..b3b7d66a 100644
--- a/src/fenv/sh/fenv.S
+++ b/src/fenv/sh/fenv.S
@@ -12,6 +12,8 @@ fegetround:
 .type   __fesetround, @function
 __fesetround:
 	sts fpscr, r0
+	mov #-4, r1
+	and r1, r0
 	or  r4, r0
 	lds r0, fpscr
 	rts
diff --git a/src/include/stdlib.h b/src/include/stdlib.h
index d38a5417..812b04de 100644
--- a/src/include/stdlib.h
+++ b/src/include/stdlib.h
@@ -8,5 +8,12 @@ hidden void __env_rm_add(char *, char *);
 hidden int __mkostemps(char *, int, int);
 hidden int __ptsname_r(int, char *, size_t);
 hidden char *__randname(char *);
+hidden void __qsort_r (void *, size_t, size_t, int (*)(const void *, const void *, void *), void *);
+
+hidden void *__libc_malloc(size_t);
+hidden void *__libc_malloc_impl(size_t);
+hidden void *__libc_calloc(size_t, size_t);
+hidden void *__libc_realloc(void *, size_t);
+hidden void __libc_free(void *);
 
 #endif
diff --git a/src/include/sys/stat.h b/src/include/sys/stat.h
new file mode 100644
index 00000000..59339bee
--- /dev/null
+++ b/src/include/sys/stat.h
@@ -0,0 +1,9 @@
+#ifndef SYS_STAT_H
+#define SYS_STAT_H
+
+#include "../../../include/sys/stat.h"
+
+hidden int __fstat(int, struct stat *);
+hidden int __fstatat(int, const char *restrict, struct stat *restrict, int);
+
+#endif
diff --git a/src/include/unistd.h b/src/include/unistd.h
index 1b4605c7..7b52a924 100644
--- a/src/include/unistd.h
+++ b/src/include/unistd.h
@@ -8,7 +8,6 @@ extern char **__environ;
 hidden int __dup3(int, int, int);
 hidden int __mkostemps(char *, int, int);
 hidden int __execvpe(const char *, char *const *, char *const *);
-hidden int __aio_close(int);
 hidden off_t __lseek(int, off_t, int);
 
 #endif
diff --git a/src/internal/aio_impl.h b/src/internal/aio_impl.h
new file mode 100644
index 00000000..a8657665
--- /dev/null
+++ b/src/internal/aio_impl.h
@@ -0,0 +1,9 @@
+#ifndef AIO_IMPL_H
+#define AIO_IMPL_H
+
+extern hidden volatile int __aio_fut;
+
+extern hidden int __aio_close(int);
+extern hidden void __aio_atfork(int);
+
+#endif
diff --git a/src/internal/atomic.h b/src/internal/atomic.h
index f938879b..8f71c8cd 100644
--- a/src/internal/atomic.h
+++ b/src/internal/atomic.h
@@ -194,7 +194,7 @@ static inline void a_store(volatile int *p, int v)
 
 #ifndef a_barrier
 #define a_barrier a_barrier
-static void a_barrier()
+static inline void a_barrier()
 {
 	volatile int tmp = 0;
 	a_cas(&tmp, 0, 0);
@@ -315,4 +315,19 @@ static inline int a_clz_64(uint64_t x)
 }
 #endif
 
+#ifndef a_clz_32
+#define a_clz_32 a_clz_32
+static inline int a_clz_32(uint32_t x)
+{
+	x >>= 1;
+	x |= x >> 1;
+	x |= x >> 2;
+	x |= x >> 4;
+	x |= x >> 8;
+	x |= x >> 16;
+	x++;
+	return 31-a_ctz_32(x);
+}
+#endif
+
 #endif
diff --git a/src/internal/dynlink.h b/src/internal/dynlink.h
index 764e3a1a..40c743e2 100644
--- a/src/internal/dynlink.h
+++ b/src/internal/dynlink.h
@@ -73,6 +73,10 @@ struct fdpic_dummy_loadmap {
 #define DL_NOMMU_SUPPORT 0
 #endif
 
+#ifndef TLSDESC_BACKWARDS
+#define TLSDESC_BACKWARDS 0
+#endif
+
 #if !DL_FDPIC
 #define IS_RELATIVE(x,s) ( \
 	(R_TYPE(x) == REL_RELATIVE) || \
@@ -92,8 +96,12 @@ struct fdpic_dummy_loadmap {
 #define DT_DEBUG_INDIRECT 0
 #endif
 
+#ifndef DT_DEBUG_INDIRECT_REL
+#define DT_DEBUG_INDIRECT_REL 0
+#endif
+
 #define AUX_CNT 32
-#define DYN_CNT 32
+#define DYN_CNT 37
 
 typedef void (*stage2_func)(unsigned char *, size_t *);
 
@@ -105,4 +113,9 @@ hidden void __dl_vseterr(const char *, va_list);
 
 hidden ptrdiff_t __tlsdesc_static(), __tlsdesc_dynamic();
 
+hidden extern int __malloc_replaced;
+hidden extern int __aligned_alloc_replaced;
+hidden void __malloc_donate(char *, char *);
+hidden int __malloc_allzerop(void *);
+
 #endif
diff --git a/src/internal/emulate_wait4.c b/src/internal/emulate_wait4.c
new file mode 100644
index 00000000..f6303412
--- /dev/null
+++ b/src/internal/emulate_wait4.c
@@ -0,0 +1,55 @@
+#include <sys/wait.h>
+#include "syscall.h"
+
+#ifndef SYS_wait4
+hidden long __emulate_wait4(int pid, int *status, int options, void *kru, int cp)
+{
+	idtype_t t;
+	int r;
+	siginfo_t info;
+
+	info.si_pid = 0;
+	if (pid < -1) {
+		t = P_PGID;
+		pid = -pid;
+	} else if (pid == -1) {
+		t = P_ALL;
+	} else if (pid == 0) {
+		t = P_PGID;
+	} else {
+		t = P_PID;
+	}
+
+	if (cp) r = __syscall_cp(SYS_waitid, t, pid, &info, options|WEXITED, kru);
+	else r = __syscall(SYS_waitid, t, pid, &info, options|WEXITED, kru);
+
+	if (r<0) return r;
+
+	if (info.si_pid && status) {
+		int sw=0;
+		switch (info.si_code) {
+		case CLD_CONTINUED:
+			sw = 0xffff;
+			break;
+		case CLD_DUMPED:
+			sw = info.si_status&0x7f | 0x80;
+			break;
+		case CLD_EXITED:
+			sw = (info.si_status&0xff) << 8;
+			break;
+		case CLD_KILLED:
+			sw = info.si_status&0x7f;
+			break;
+		case CLD_STOPPED:
+		case CLD_TRAPPED:
+			/* see ptrace(2); the high bits of si_status can contain */
+			/* PTRACE_EVENT_ values which must be preserved */
+			sw = (info.si_status << 8) + 0x7f;
+			break;
+		}
+		*status = sw;
+	}
+
+	return info.si_pid;
+}
+#endif
diff --git a/src/internal/fork_impl.h b/src/internal/fork_impl.h
new file mode 100644
index 00000000..f995fce2
--- /dev/null
+++ b/src/internal/fork_impl.h
@@ -0,0 +1,21 @@
+#include <features.h>
+
+extern hidden volatile int *const __at_quick_exit_lockptr;
+extern hidden volatile int *const __atexit_lockptr;
+extern hidden volatile int *const __gettext_lockptr;
+extern hidden volatile int *const __locale_lockptr;
+extern hidden volatile int *const __random_lockptr;
+extern hidden volatile int *const __sem_open_lockptr;
+extern hidden volatile int *const __stdio_ofl_lockptr;
+extern hidden volatile int *const __syslog_lockptr;
+extern hidden volatile int *const __timezone_lockptr;
+
+extern hidden volatile int *const __bump_lockptr;
+
+extern hidden volatile int *const __vmlock_lockptr;
+
+hidden void __malloc_atfork(int);
+hidden void __ldso_atfork(int);
+hidden void __pthread_key_atfork(int);
+
+hidden void __post_Fork(int);
diff --git a/src/internal/ksigaction.h b/src/internal/ksigaction.h
index 8ebd5938..ef333f33 100644
--- a/src/internal/ksigaction.h
+++ b/src/internal/ksigaction.h
@@ -6,8 +6,13 @@
 struct k_sigaction {
 	void (*handler)(int);
 	unsigned long flags;
+#ifdef SA_RESTORER
 	void (*restorer)(void);
+#endif
 	unsigned mask[2];
+#ifndef SA_RESTORER
+	void *unused;
+#endif
 };
 
 hidden void __restore(), __restore_rt();
diff --git a/src/internal/libc.h b/src/internal/libc.h
index ac97dc7e..619bba86 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -18,10 +18,11 @@ struct tls_module {
 };
 
 struct __libc {
-	int can_do_threads;
-	int threaded;
-	int secure;
-	volatile int threads_minus_1;
+	char can_do_threads;
+	char threaded;
+	char secure;
+	volatile signed char need_locks;
+	int threads_minus_1;
 	size_t *auxv;
 	struct tls_module *tls_head;
 	size_t tls_size, tls_align, tls_cnt;
diff --git a/src/internal/libm.h b/src/internal/libm.h
index b5bd26b8..72ad17d8 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -236,13 +236,13 @@ hidden int    __rem_pio2(double,double*);
 hidden double __sin(double,double,int);
 hidden double __cos(double,double);
 hidden double __tan(double,double,int);
-hidden double __expo2(double);
+hidden double __expo2(double,double);
 
 hidden int    __rem_pio2f(float,double*);
 hidden float  __sindf(double);
 hidden float  __cosdf(double);
 hidden float  __tandf(double,int);
-hidden float  __expo2f(float);
+hidden float  __expo2f(float,float);
 
 hidden int __rem_pio2l(long double, long double *);
 hidden long double __sinl(long double, long double, int);
@@ -267,5 +267,8 @@ hidden double __math_uflow(uint32_t);
 hidden double __math_oflow(uint32_t);
 hidden double __math_divzero(uint32_t);
 hidden double __math_invalid(double);
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+hidden long double __math_invalidl(long double);
+#endif
 
 #endif
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index 741a71c4..4431a92e 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -15,6 +15,8 @@ struct __locale_map {
 	const struct __locale_map *next;
 };
 
+extern hidden volatile int __locale_lock[1];
+
 extern hidden const struct __locale_map __c_dot_utf8;
 extern hidden const struct __locale_struct __c_locale;
 extern hidden const struct __locale_struct __c_dot_utf8_locale;
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 5742dfc5..de2b9d8b 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -11,16 +11,25 @@
 #include "atomic.h"
 #include "futex.h"
 
+#include "pthread_arch.h"
+
 #define pthread __pthread
 
 struct pthread {
 	/* Part 1 -- these fields may be external or
 	 * internal (accessed via asm) ABI. Do not change. */
 	struct pthread *self;
+#ifndef TLS_ABOVE_TP
 	uintptr_t *dtv;
+#endif
 	struct pthread *prev, *next; /* non-ABI */
 	uintptr_t sysinfo;
-	uintptr_t canary, canary2;
+#ifndef TLS_ABOVE_TP
+#ifdef CANARY_PAD
+	uintptr_t canary_pad;
+#endif
+	uintptr_t canary;
+#endif
 
 	/* Part 2 -- implementation details, non-ABI. */
 	int tid;
@@ -43,6 +52,7 @@ struct pthread {
 		long off;
 		volatile void *volatile pending;
 	} robust_list;
+	int h_errno_val;
 	volatile int timer_id;
 	locale_t locale;
 	volatile int killlock[1];
@@ -51,21 +61,19 @@ struct pthread {
 
 	/* Part 3 -- the positions of these fields relative to
 	 * the end of the structure is external and internal ABI. */
-	uintptr_t canary_at_end;
-	uintptr_t *dtv_copy;
+#ifdef TLS_ABOVE_TP
+	uintptr_t canary;
+	uintptr_t *dtv;
+#endif
 };
 
 enum {
-	DT_EXITING = 0,
+	DT_EXITED = 0,
+	DT_EXITING,
 	DT_JOINABLE,
 	DT_DETACHED,
 };
 
-struct __timer {
-	int timerid;
-	pthread_t thread;
-};
-
 #define __SU (sizeof(size_t)/sizeof(int))
 
 #define _a_stacksize __u.__s[0]
@@ -98,16 +106,22 @@ struct __timer {
 #define _b_waiters2 __u.__vi[4]
 #define _b_inst __u.__p[3]
 
-#include "pthread_arch.h"
-
-#ifndef CANARY
-#define CANARY canary
+#ifndef TP_OFFSET
+#define TP_OFFSET 0
 #endif
 
 #ifndef DTP_OFFSET
 #define DTP_OFFSET 0
 #endif
 
+#ifdef TLS_ABOVE_TP
+#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + TP_OFFSET)
+#define __pthread_self() ((pthread_t)(__get_tp() - sizeof(struct __pthread) - TP_OFFSET))
+#else
+#define TP_ADJ(p) (p)
+#define __pthread_self() ((pthread_t)__get_tp())
+#endif
+
 #ifndef tls_mod_off_t
 #define tls_mod_off_t size_t
 #endif
@@ -141,7 +155,6 @@ hidden int __pthread_key_delete_impl(pthread_key_t);
 
 extern hidden volatile size_t __pthread_tsd_size;
 extern hidden void *__pthread_tsd_main[];
-extern hidden volatile int __aio_fut;
 extern hidden volatile int __eintr_valid_flag;
 
 hidden int __clone(int (*)(void *), void *, int, void *, ...);
@@ -176,6 +189,8 @@ hidden void __tl_sync(pthread_t);
 
 extern hidden volatile int __thread_list_lock;
 
+extern hidden volatile int __abort_lock[1];
+
 extern hidden unsigned __default_stacksize;
 extern hidden unsigned __default_guardsize;
 
diff --git a/src/internal/shgetc.c b/src/internal/shgetc.c
index a4a9c633..7455d2f0 100644
--- a/src/internal/shgetc.c
+++ b/src/internal/shgetc.c
@@ -32,6 +32,6 @@ int __shgetc(FILE *f)
 	else
 		f->shend = f->rend;
 	f->shcnt = f->buf - f->rpos + cnt;
-	if (f->rpos[-1] != c) f->rpos[-1] = c;
+	if (f->rpos <= f->buf) f->rpos[-1] = c;
 	return c;
 }
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index d7398f59..0b2438d6 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -60,8 +60,6 @@ hidden size_t __stdout_write(FILE *, const unsigned char *, size_t);
 hidden off_t __stdio_seek(FILE *, off_t, int);
 hidden int __stdio_close(FILE *);
 
-hidden size_t __string_read(FILE *, unsigned char *, size_t);
-
 hidden int __toread(FILE *);
 hidden int __towrite(FILE *);
 
diff --git a/src/internal/syscall.h b/src/internal/syscall.h
index 975a0031..2d8a5c13 100644
--- a/src/internal/syscall.h
+++ b/src/internal/syscall.h
@@ -2,6 +2,7 @@
 #define _INTERNAL_SYSCALL_H
 
 #include <features.h>
+#include <errno.h>
 #include <sys/syscall.h>
 #include "syscall_arch.h"
 
@@ -57,15 +58,22 @@ hidden long __syscall_ret(unsigned long),
 #define __syscall_cp(...) __SYSCALL_DISP(__syscall_cp,__VA_ARGS__)
 #define syscall_cp(...) __syscall_ret(__syscall_cp(__VA_ARGS__))
 
-#ifndef SYSCALL_USE_SOCKETCALL
-#define __socketcall(nm,a,b,c,d,e,f) __syscall(SYS_##nm, a, b, c, d, e, f)
-#define __socketcall_cp(nm,a,b,c,d,e,f) __syscall_cp(SYS_##nm, a, b, c, d, e, f)
-#else
-#define __socketcall(nm,a,b,c,d,e,f) __syscall(SYS_socketcall, __SC_##nm, \
-    ((long [6]){ (long)a, (long)b, (long)c, (long)d, (long)e, (long)f }))
-#define __socketcall_cp(nm,a,b,c,d,e,f) __syscall_cp(SYS_socketcall, __SC_##nm, \
-    ((long [6]){ (long)a, (long)b, (long)c, (long)d, (long)e, (long)f }))
-#endif
+static inline long __alt_socketcall(int sys, int sock, int cp, syscall_arg_t a, syscall_arg_t b, syscall_arg_t c, syscall_arg_t d, syscall_arg_t e, syscall_arg_t f)
+{
+	long r;
+	if (cp) r = __syscall_cp(sys, a, b, c, d, e, f);
+	else r = __syscall(sys, a, b, c, d, e, f);
+	if (r != -ENOSYS) return r;
+#ifdef SYS_socketcall
+	if (cp) r = __syscall_cp(SYS_socketcall, sock, ((long[6]){a, b, c, d, e, f}));
+	else r = __syscall(SYS_socketcall, sock, ((long[6]){a, b, c, d, e, f}));
+#endif
+	return r;
+}
+#define __socketcall(nm, a, b, c, d, e, f) __alt_socketcall(SYS_##nm, __SC_##nm, 0, \
+	__scc(a), __scc(b), __scc(c), __scc(d), __scc(e), __scc(f))
+#define __socketcall_cp(nm, a, b, c, d, e, f) __alt_socketcall(SYS_##nm, __SC_##nm, 1, \
+	__scc(a), __scc(b), __scc(c), __scc(d), __scc(e), __scc(f))
 
 /* fixup legacy 16-bit junk */
 
@@ -193,43 +201,43 @@ hidden long __syscall_ret(unsigned long),
 #define SYS_sendfile SYS_sendfile64
 #endif
 
-#ifndef SYS_timer_settime
+#ifdef SYS_timer_settime32
 #define SYS_timer_settime SYS_timer_settime32
 #endif
 
-#ifndef SYS_timer_gettime
+#ifdef SYS_timer_gettime32
 #define SYS_timer_gettime SYS_timer_gettime32
 #endif
 
-#ifndef SYS_timerfd_settime
+#ifdef SYS_timerfd_settime32
 #define SYS_timerfd_settime SYS_timerfd_settime32
 #endif
 
-#ifndef SYS_timerfd_gettime
+#ifdef SYS_timerfd_gettime32
 #define SYS_timerfd_gettime SYS_timerfd_gettime32
 #endif
 
-#ifndef SYS_clock_settime
+#ifdef SYS_clock_settime32
 #define SYS_clock_settime SYS_clock_settime32
 #endif
 
-#ifndef SYS_clock_gettime
+#ifdef SYS_clock_gettime32
 #define SYS_clock_gettime SYS_clock_gettime32
 #endif
 
-#ifndef SYS_clock_getres
+#ifdef SYS_clock_getres_time32
 #define SYS_clock_getres SYS_clock_getres_time32
 #endif
 
-#ifndef SYS_clock_nanosleep
+#ifdef SYS_clock_nanosleep_time32
 #define SYS_clock_nanosleep SYS_clock_nanosleep_time32
 #endif
 
-#ifndef SYS_gettimeofday
+#ifdef SYS_gettimeofday_time32
 #define SYS_gettimeofday SYS_gettimeofday_time32
 #endif
 
-#ifndef SYS_settimeofday
+#ifdef SYS_settimeofday_time32
 #define SYS_settimeofday SYS_settimeofday_time32
 #endif
 
@@ -338,6 +346,12 @@ hidden long __syscall_ret(unsigned long),
 #define __SC_recvmmsg    19
 #define __SC_sendmmsg    20
 
+/* This is valid only because all socket syscalls are made via
+ * socketcall, which always fills unused argument slots with zeros. */
+#ifndef SYS_accept
+#define SYS_accept SYS_accept4
+#endif
+
 #ifndef SO_RCVTIMEO_OLD
 #define SO_RCVTIMEO_OLD  20
 #endif
@@ -377,6 +391,29 @@ hidden long __syscall_ret(unsigned long),
 #define __sys_open_cp(...) __SYSCALL_DISP(__sys_open_cp,,__VA_ARGS__)
 #define sys_open_cp(...) __syscall_ret(__sys_open_cp(__VA_ARGS__))
 
+#ifdef SYS_pause
+#define __sys_pause() __syscall(SYS_pause)
+#define __sys_pause_cp() __syscall_cp(SYS_pause)
+#else
+#define __sys_pause() __syscall(SYS_ppoll, 0, 0, 0, 0)
+#define __sys_pause_cp() __syscall_cp(SYS_ppoll, 0, 0, 0, 0)
+#endif
+
+#define sys_pause() __syscall_ret(__sys_pause())
+#define sys_pause_cp() __syscall_ret(__sys_pause_cp())
+
+#ifdef SYS_wait4
+#define __sys_wait4(a,b,c,d) __syscall(SYS_wait4,a,b,c,d)
+#define __sys_wait4_cp(a,b,c,d) __syscall_cp(SYS_wait4,a,b,c,d)
+#else
+hidden long __emulate_wait4(int, int *, int, void *, int);
+#define __sys_wait4(a,b,c,d) __emulate_wait4(a,b,c,d,0)
+#define __sys_wait4_cp(a,b,c,d) __emulate_wait4(a,b,c,d,1)
+#endif
+
+#define sys_wait4(a,b,c,d) __syscall_ret(__sys_wait4(a,b,c,d))
+#define sys_wait4_cp(a,b,c,d) __syscall_ret(__sys_wait4_cp(a,b,c,d))
+
 hidden void __procfdname(char __buf[static 15+3*sizeof(int)], unsigned);
 
 hidden void *__vdsosym(const char *, const char *);
diff --git a/src/ipc/msgctl.c b/src/ipc/msgctl.c
index b043041a..9c114406 100644
--- a/src/ipc/msgctl.c
+++ b/src/ipc/msgctl.c
@@ -9,6 +9,14 @@
 
 int msgctl(int q, int cmd, struct msqid_ds *buf)
 {
+#if IPC_TIME64
+	struct msqid_ds out, *orig;
+	if (cmd&IPC_TIME64) {
+		out = (struct msqid_ds){0};
+		orig = buf;
+		buf = &out;
+	}
+#endif
 #ifdef SYSCALL_IPC_BROKEN_MODE
 	struct msqid_ds tmp;
 	if (cmd == IPC_SET) {
@@ -32,6 +40,8 @@ int msgctl(int q, int cmd, struct msqid_ds *buf)
 #endif
 #if IPC_TIME64
 	if (r >= 0 && (cmd&IPC_TIME64)) {
+		buf = orig;
+		*buf = out;
 		IPC_HILO(buf, msg_stime);
 		IPC_HILO(buf, msg_rtime);
 		IPC_HILO(buf, msg_ctime);
diff --git a/src/ipc/semctl.c b/src/ipc/semctl.c
index ed982747..bbb97d7a 100644
--- a/src/ipc/semctl.c
+++ b/src/ipc/semctl.c
@@ -28,6 +28,14 @@ int semctl(int id, int num, int cmd, ...)
 		arg = va_arg(ap, union semun);
 		va_end(ap);
 	}
+#if IPC_TIME64
+	struct semid_ds out, *orig;
+	if (cmd&IPC_TIME64) {
+		out = (struct semid_ds){0};
+		orig = arg.buf;
+		arg.buf = &out;
+	}
+#endif
 #ifdef SYSCALL_IPC_BROKEN_MODE
 	struct semid_ds tmp;
 	if (cmd == IPC_SET) {
@@ -51,6 +59,8 @@ int semctl(int id, int num, int cmd, ...)
 #endif
 #if IPC_TIME64
 	if (r >= 0 && (cmd&IPC_TIME64)) {
+		arg.buf = orig;
+		*arg.buf = out;
 		IPC_HILO(arg.buf, sem_otime);
 		IPC_HILO(arg.buf, sem_ctime);
 	}
diff --git a/src/ipc/semtimedop.c b/src/ipc/semtimedop.c
index 1632e7b0..a104af21 100644
--- a/src/ipc/semtimedop.c
+++ b/src/ipc/semtimedop.c
@@ -7,7 +7,8 @@
 #define IS32BIT(x) !((x)+0x80000000ULL>>32)
 #define CLAMP(x) (int)(IS32BIT(x) ? (x) : 0x7fffffffU+((0ULL+(x))>>63))
 
-#if !defined(SYS_semtimedop) && !defined(SYS_ipc)
+#if !defined(SYS_semtimedop) && !defined(SYS_ipc) || \
+	SYS_semtimedop == SYS_semtimedop_time64
 #define NO_TIME32 1
 #else
 #define NO_TIME32 0
diff --git a/src/ipc/shmctl.c b/src/ipc/shmctl.c
index de3ce9d4..1c9f78c2 100644
--- a/src/ipc/shmctl.c
+++ b/src/ipc/shmctl.c
@@ -9,6 +9,14 @@
 
 int shmctl(int id, int cmd, struct shmid_ds *buf)
 {
+#if IPC_TIME64
+	struct shmid_ds out, *orig;
+	if (cmd&IPC_TIME64) {
+		out = (struct shmid_ds){0};
+		orig = buf;
+		buf = &out;
+	}
+#endif
 #ifdef SYSCALL_IPC_BROKEN_MODE
 	struct shmid_ds tmp;
 	if (cmd == IPC_SET) {
@@ -32,6 +40,8 @@ int shmctl(int id, int cmd, struct shmid_ds *buf)
 #endif
 #if IPC_TIME64
 	if (r >= 0 && (cmd&IPC_TIME64)) {
+		buf = orig;
+		*buf = out;
 		IPC_HILO(buf, shm_atime);
 		IPC_HILO(buf, shm_dtime);
 		IPC_HILO(buf, shm_ctime);
diff --git a/src/ldso/dl_iterate_phdr.c b/src/ldso/dl_iterate_phdr.c
index 86c87ef8..9546dd36 100644
--- a/src/ldso/dl_iterate_phdr.c
+++ b/src/ldso/dl_iterate_phdr.c
@@ -1,5 +1,6 @@
 #include <elf.h>
 #include <link.h>
+#include "pthread_impl.h"
 #include "libc.h"
 
 #define AUX_CNT 38
@@ -35,7 +36,7 @@ static int static_dl_iterate_phdr(int(*callback)(struct dl_phdr_info *info, size
 	info.dlpi_subs  = 0;
 	if (tls_phdr) {
 		info.dlpi_tls_modid = 1;
-		info.dlpi_tls_data = (void *)(base + tls_phdr->p_vaddr);
+		info.dlpi_tls_data = __tls_get_addr((tls_mod_off_t[]){1,0});
 	} else {
 		info.dlpi_tls_modid = 0;
 		info.dlpi_tls_data = 0;
diff --git a/src/ldso/dlerror.c b/src/ldso/dlerror.c
index 3fcc7779..dae0f3a9 100644
--- a/src/ldso/dlerror.c
+++ b/src/ldso/dlerror.c
@@ -3,7 +3,12 @@
 #include <stdarg.h>
 #include "pthread_impl.h"
 #include "dynlink.h"
-#include "lock.h"
+#include "atomic.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
 
 char *dlerror()
 {
@@ -17,30 +22,37 @@ char *dlerror()
 		return s;
 }
 
-static volatile int freebuf_queue_lock[1];
-static void **freebuf_queue;
+/* Atomic singly-linked list, used to store list of thread-local dlerror
+ * buffers for deferred free. They cannot be freed at thread exit time
+ * because, by the time it's known they can be freed, the exiting thread
+ * is in a highly restrictive context where it cannot call (even the
+ * libc-internal) free. It also can't take locks; thus the atomic list. */
+
+static void *volatile freebuf_queue;
 
 void __dl_thread_cleanup(void)
 {
 	pthread_t self = __pthread_self();
-	if (self->dlerror_buf && self->dlerror_buf != (void *)-1) {
-		LOCK(freebuf_queue_lock);
-		void **p = (void **)self->dlerror_buf;
-		*p = freebuf_queue;
-		freebuf_queue = p;
-		UNLOCK(freebuf_queue_lock);
-	}
+	if (!self->dlerror_buf || self->dlerror_buf == (void *)-1)
+		return;
+	void *h;
+	do {
+		h = freebuf_queue;
+		*(void **)self->dlerror_buf = h;
+	} while (a_cas_p(&freebuf_queue, h, self->dlerror_buf) != h);
 }
 
 hidden void __dl_vseterr(const char *fmt, va_list ap)
 {
-	LOCK(freebuf_queue_lock);
-	while (freebuf_queue) {
-		void **p = freebuf_queue;
-		freebuf_queue = *p;
-		free(p);
+	void **q;
+	do q = freebuf_queue;
+	while (q && a_cas_p(&freebuf_queue, q, 0) != q);
+
+	while (q) {
+		void **p = *q;
+		free(q);
+		q = p;
 	}
-	UNLOCK(freebuf_queue_lock);
 
 	va_list ap2;
 	va_copy(ap2, ap);
diff --git a/src/ldso/loongarch64/dlsym.s b/src/ldso/loongarch64/dlsym.s
new file mode 100644
index 00000000..26fabcdb
--- /dev/null
+++ b/src/ldso/loongarch64/dlsym.s
@@ -0,0 +1,7 @@
+.global dlsym
+.hidden __dlsym
+.type   dlsym,@function
+dlsym:
+	move      $a2, $ra
+	la.global $t0, __dlsym
+	jr        $t0
diff --git a/src/ldso/loongarch64/tlsdesc.s b/src/ldso/loongarch64/tlsdesc.s
new file mode 100644
index 00000000..4b6ea0e5
--- /dev/null
+++ b/src/ldso/loongarch64/tlsdesc.s
@@ -0,0 +1,37 @@
+.text
+.global __tlsdesc_static
+.hidden __tlsdesc_static
+.type __tlsdesc_static,%function
+__tlsdesc_static:
+	ld.d $a0, $a0, 8
+	jr $ra
+# size_t __tlsdesc_dynamic(size_t *a)
+# {
+#      struct {size_t modidx,off;} *p = (void*)a[1];
+#      size_t *dtv = *(size_t**)(tp - 8);
+#      return dtv[p->modidx] + p->off - tp;
+# }
+.global __tlsdesc_dynamic
+.hidden __tlsdesc_dynamic
+.type __tlsdesc_dynamic,%function
+__tlsdesc_dynamic:
+	addi.d $sp, $sp, -16
+	st.d $t1, $sp, 0
+	st.d $t2, $sp, 8
+
+	ld.d $t2, $tp, -8 # t2=dtv
+
+	ld.d $a0, $a0, 8  # a0=&{modidx,off}
+	ld.d $t1, $a0, 8  # t1=off
+	ld.d $a0, $a0, 0  # a0=modidx
+	slli.d $a0, $a0, 3  # a0=8*modidx
+
+	add.d $a0, $a0, $t2  # a0=dtv+8*modidx
+	ld.d $a0, $a0, 0  # a0=dtv[modidx]
+	add.d $a0, $a0, $t1 # a0=dtv[modidx]+off
+	sub.d $a0, $a0, $tp # a0=dtv[modidx]+off-tp
+
+	ld.d $t1, $sp, 0
+	ld.d $t2, $sp, 8
+	addi.d $sp, $sp, 16
+	jr $ra
diff --git a/src/ldso/riscv32/dlsym.s b/src/ldso/riscv32/dlsym.s
new file mode 100644
index 00000000..2bafd72d
--- /dev/null
+++ b/src/ldso/riscv32/dlsym.s
@@ -0,0 +1,6 @@
+.global dlsym
+.hidden __dlsym
+.type dlsym, %function
+dlsym:
+	mv a2, ra
+	tail __dlsym
diff --git a/src/ldso/riscv64/tlsdesc.s b/src/ldso/riscv64/tlsdesc.s
new file mode 100644
index 00000000..bef8b322
--- /dev/null
+++ b/src/ldso/riscv64/tlsdesc.s
@@ -0,0 +1,32 @@
+.text
+.global __tlsdesc_static
+.hidden __tlsdesc_static
+.type __tlsdesc_static,%function
+__tlsdesc_static:
+	ld a0,8(a0)
+	jr t0
+
+.global __tlsdesc_dynamic
+.hidden __tlsdesc_dynamic
+.type __tlsdesc_dynamic,%function
+__tlsdesc_dynamic:
+	add sp,sp,-16
+	sd t1,(sp)
+	sd t2,8(sp)
+
+	ld t2,-8(tp) # t2=dtv
+
+	ld a0,8(a0)  # a0=&{modidx,off}
+	ld t1,8(a0)  # t1=off
+	ld a0,(a0)   # a0=modidx
+	sll a0,a0,3  # a0=8*modidx
+
+	add a0,a0,t2 # a0=dtv+8*modidx
+	ld a0,(a0)   # a0=dtv[modidx]
+	add a0,a0,t1 # a0=dtv[modidx]+off
+	sub a0,a0,tp # a0=dtv[modidx]+off-tp
+
+	ld t1,(sp)
+	ld t2,8(sp)
+	add sp,sp,16
+	jr t0
diff --git a/src/ldso/sh/dlsym.s b/src/ldso/sh/dlsym.s
index 11a6fff5..34f3c35c 100644
--- a/src/ldso/sh/dlsym.s
+++ b/src/ldso/sh/dlsym.s
@@ -5,7 +5,7 @@
 dlsym:
 	mov.l L1, r0
 1:	braf  r0
-	 mov.l @r15, r6
+	 sts pr, r6
 
 .align 2
 L1:	.long __dlsym@PLT-(1b+4-.)
diff --git a/src/legacy/cuserid.c b/src/legacy/cuserid.c
index 4e78798d..dcaf73d4 100644
--- a/src/legacy/cuserid.c
+++ b/src/legacy/cuserid.c
@@ -2,13 +2,21 @@
 #include <pwd.h>
 #include <stdio.h>
 #include <unistd.h>
+#include <string.h>
 
 char *cuserid(char *buf)
 {
+	static char usridbuf[L_cuserid];
 	struct passwd pw, *ppw;
 	long pwb[256];
-	if (getpwuid_r(geteuid(), &pw, (void *)pwb, sizeof pwb, &ppw))
-		return 0;
-	snprintf(buf, L_cuserid, "%s", pw.pw_name);
+	if (buf) *buf = 0;
+	getpwuid_r(geteuid(), &pw, (void *)pwb, sizeof pwb, &ppw);
+	if (!ppw)
+		return buf;
+	size_t len = strnlen(pw.pw_name, L_cuserid);
+	if (len == L_cuserid)
+		return buf;
+	if (!buf) buf = usridbuf;
+	memcpy(buf, pw.pw_name, len+1);
 	return buf;
 }
diff --git a/src/legacy/ftw.c b/src/legacy/ftw.c
index 506bd29c..e757fc6f 100644
--- a/src/legacy/ftw.c
+++ b/src/legacy/ftw.c
@@ -7,5 +7,3 @@ int ftw(const char *path, int (*fn)(const char *, const struct stat *, int), int
 	 * actually undefined, but works on all real-world machines. */
 	return nftw(path, (int (*)())fn, fd_limit, FTW_PHYS);
 }
-
-weak_alias(ftw, ftw64);
diff --git a/src/legacy/getusershell.c b/src/legacy/getusershell.c
index 5fecdec2..1c5d98ec 100644
--- a/src/legacy/getusershell.c
+++ b/src/legacy/getusershell.c
@@ -25,8 +25,10 @@ char *getusershell(void)
 	ssize_t l;
 	if (!f) setusershell();
 	if (!f) return 0;
-	l = getline(&line, &linesize, f);
-	if (l <= 0) return 0;
+	do {
+		l = getline(&line, &linesize, f);
+		if (l <= 0) return 0;
+	} while (line[0] == '#' || line[0] == '\n');
 	if (line[l-1]=='\n') line[l-1]=0;
 	return line;
 }
diff --git a/src/legacy/lutimes.c b/src/legacy/lutimes.c
index 2e5502d1..dd465923 100644
--- a/src/legacy/lutimes.c
+++ b/src/legacy/lutimes.c
@@ -6,9 +6,11 @@
 int lutimes(const char *filename, const struct timeval tv[2])
 {
 	struct timespec times[2];
-	times[0].tv_sec  = tv[0].tv_sec;
-	times[0].tv_nsec = tv[0].tv_usec * 1000;
-	times[1].tv_sec  = tv[1].tv_sec;
-	times[1].tv_nsec = tv[1].tv_usec * 1000;
-	return utimensat(AT_FDCWD, filename, times, AT_SYMLINK_NOFOLLOW);
+	if (tv) {
+		times[0].tv_sec  = tv[0].tv_sec;
+		times[0].tv_nsec = tv[0].tv_usec * 1000;
+		times[1].tv_sec  = tv[1].tv_sec;
+		times[1].tv_nsec = tv[1].tv_usec * 1000;
+	}
+	return utimensat(AT_FDCWD, filename, tv ? times : 0, AT_SYMLINK_NOFOLLOW);
 }
diff --git a/src/linux/cache.c b/src/linux/cache.c
index 0eb051c2..e76f7812 100644
--- a/src/linux/cache.c
+++ b/src/linux/cache.c
@@ -21,7 +21,7 @@ weak_alias(__cachectl, cachectl);
 #ifdef SYS_riscv_flush_icache
 
 #define VDSO_FLUSH_ICACHE_SYM "__vdso_flush_icache"
-#define VDSO_FLUSH_ICACHE_VER "LINUX_4.5"
+#define VDSO_FLUSH_ICACHE_VER "LINUX_4.15"
 
 static void *volatile vdso_func;
 
@@ -45,6 +45,7 @@ int __riscv_flush_icache(void *start, void *end, unsigned long int flags)
 		if (!r) return r;
 		if (r != -ENOSYS) return __syscall_ret(r);
 	}
+	return syscall(SYS_riscv_flush_icache, start, end, flags);
 }
 weak_alias(__riscv_flush_icache, riscv_flush_icache);
 #endif
diff --git a/src/linux/clock_adjtime.c b/src/linux/clock_adjtime.c
index 23eb8729..d4d03d24 100644
--- a/src/linux/clock_adjtime.c
+++ b/src/linux/clock_adjtime.c
@@ -38,55 +38,52 @@ int clock_adjtime (clockid_t clock_id, struct timex *utx)
 {
 	int r = -ENOSYS;
 #ifdef SYS_clock_adjtime64
-	if (SYS_clock_adjtime == SYS_clock_adjtime64 ||
-	    (utx->modes & ADJ_SETOFFSET) && !IS32BIT(utx->time.tv_sec)) {
-		struct ktimex64 ktx = {
-			.modes = utx->modes,
-			.offset = utx->offset,
-			.freq = utx->freq,
-			.maxerror = utx->maxerror,
-			.esterror = utx->esterror,
-			.status = utx->status,
-			.constant = utx->constant,
-			.precision = utx->precision,
-			.tolerance = utx->tolerance,
-			.time_sec = utx->time.tv_sec,
-			.time_usec = utx->time.tv_usec,
-			.tick = utx->tick,
-			.ppsfreq = utx->ppsfreq,
-			.jitter = utx->jitter,
-			.shift = utx->shift,
-			.stabil = utx->stabil,
-			.jitcnt = utx->jitcnt,
-			.calcnt = utx->calcnt,
-			.errcnt = utx->errcnt,
-			.stbcnt = utx->stbcnt,
-			.tai = utx->tai,
-		};
-		r = __syscall(SYS_clock_adjtime, clock_id, &ktx);
-		if (r>=0) {
-			utx->modes = ktx.modes;
-			utx->offset = ktx.offset;
-			utx->freq = ktx.freq;
-			utx->maxerror = ktx.maxerror;
-			utx->esterror = ktx.esterror;
-			utx->status = ktx.status;
-			utx->constant = ktx.constant;
-			utx->precision = ktx.precision;
-			utx->tolerance = ktx.tolerance;
-			utx->time.tv_sec = ktx.time_sec;
-			utx->time.tv_usec = ktx.time_usec;
-			utx->tick = ktx.tick;
-			utx->ppsfreq = ktx.ppsfreq;
-			utx->jitter = ktx.jitter;
-			utx->shift = ktx.shift;
-			utx->stabil = ktx.stabil;
-			utx->jitcnt = ktx.jitcnt;
-			utx->calcnt = ktx.calcnt;
-			utx->errcnt = ktx.errcnt;
-			utx->stbcnt = ktx.stbcnt;
-			utx->tai = ktx.tai;
-		}
+	struct ktimex64 ktx = {
+		.modes = utx->modes,
+		.offset = utx->offset,
+		.freq = utx->freq,
+		.maxerror = utx->maxerror,
+		.esterror = utx->esterror,
+		.status = utx->status,
+		.constant = utx->constant,
+		.precision = utx->precision,
+		.tolerance = utx->tolerance,
+		.time_sec = utx->time.tv_sec,
+		.time_usec = utx->time.tv_usec,
+		.tick = utx->tick,
+		.ppsfreq = utx->ppsfreq,
+		.jitter = utx->jitter,
+		.shift = utx->shift,
+		.stabil = utx->stabil,
+		.jitcnt = utx->jitcnt,
+		.calcnt = utx->calcnt,
+		.errcnt = utx->errcnt,
+		.stbcnt = utx->stbcnt,
+		.tai = utx->tai,
+	};
+	r = __syscall(SYS_clock_adjtime64, clock_id, &ktx);
+	if (r>=0) {
+		utx->modes = ktx.modes;
+		utx->offset = ktx.offset;
+		utx->freq = ktx.freq;
+		utx->maxerror = ktx.maxerror;
+		utx->esterror = ktx.esterror;
+		utx->status = ktx.status;
+		utx->constant = ktx.constant;
+		utx->precision = ktx.precision;
+		utx->tolerance = ktx.tolerance;
+		utx->time.tv_sec = ktx.time_sec;
+		utx->time.tv_usec = ktx.time_usec;
+		utx->tick = ktx.tick;
+		utx->ppsfreq = ktx.ppsfreq;
+		utx->jitter = ktx.jitter;
+		utx->shift = ktx.shift;
+		utx->stabil = ktx.stabil;
+		utx->jitcnt = ktx.jitcnt;
+		utx->calcnt = ktx.calcnt;
+		utx->errcnt = ktx.errcnt;
+		utx->stbcnt = ktx.stbcnt;
+		utx->tai = ktx.tai;
 	}
 	if (SYS_clock_adjtime == SYS_clock_adjtime64 || r!=-ENOSYS)
 		return __syscall_ret(r);
diff --git a/src/linux/clone.c b/src/linux/clone.c
index 8c1af7d3..257c1cec 100644
--- a/src/linux/clone.c
+++ b/src/linux/clone.c
@@ -4,18 +4,62 @@
 #include <sched.h>
 #include "pthread_impl.h"
 #include "syscall.h"
+#include "lock.h"
+#include "fork_impl.h"
+
+struct clone_start_args {
+	int (*func)(void *);
+	void *arg;
+	sigset_t sigmask;
+};
+
+static int clone_start(void *arg)
+{
+	struct clone_start_args *csa = arg;
+	__post_Fork(0);
+	__restore_sigs(&csa->sigmask);
+	return csa->func(csa->arg);
+}
 
 int clone(int (*func)(void *), void *stack, int flags, void *arg, ...)
 {
+	struct clone_start_args csa;
 	va_list ap;
-	pid_t *ptid, *ctid;
-	void  *tls;
+	pid_t *ptid = 0, *ctid = 0;
+	void  *tls = 0;
+
+	/* Flags that produce an invalid thread/TLS state are disallowed. */
+	int badflags = CLONE_THREAD | CLONE_SETTLS | CLONE_CHILD_CLEARTID;
+
+	if ((flags & badflags) || !stack)
+		return __syscall_ret(-EINVAL);
 
 	va_start(ap, arg);
-	ptid = va_arg(ap, pid_t *);
-	tls  = va_arg(ap, void *);
-	ctid = va_arg(ap, pid_t *);
+	if (flags & (CLONE_PIDFD | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID))
+	 	ptid = va_arg(ap, pid_t *);
+	if (flags & CLONE_CHILD_SETTID) {
+		tls = va_arg(ap, void *);
+		ctid = va_arg(ap, pid_t *);
+	}
 	va_end(ap);
 
-	return __syscall_ret(__clone(func, stack, flags, arg, ptid, tls, ctid));
+	/* If CLONE_VM is used, it's impossible to give the child a consistent
+	 * thread structure. In this case, the best we can do is assume the
+	 * caller is content with an extremely restrictive execution context
+	 * like the one vfork() would provide. */
+	if (flags & CLONE_VM) return __syscall_ret(
+		__clone(func, stack, flags, arg, ptid, tls, ctid));
+
+	__block_all_sigs(&csa.sigmask);
+	LOCK(__abort_lock);
+
+	/* Setup the a wrapper start function for the child process to do
+	 * mimic _Fork in producing a consistent execution state. */
+	csa.func = func;
+	csa.arg = arg;
+	int ret = __clone(clone_start, stack, flags, &csa, ptid, tls, ctid);
+
+	__post_Fork(ret);
+	__restore_sigs(&csa.sigmask);
+	return __syscall_ret(ret);
 }
diff --git a/src/linux/epoll.c b/src/linux/epoll.c
index deff5b10..e56e8f4c 100644
--- a/src/linux/epoll.c
+++ b/src/linux/epoll.c
@@ -5,6 +5,7 @@
 
 int epoll_create(int size)
 {
+	if (size<=0) return __syscall_ret(-EINVAL);
 	return epoll_create1(0);
 }
 
@@ -24,9 +25,9 @@ int epoll_ctl(int fd, int op, int fd2, struct epoll_event *ev)
 
 int epoll_pwait(int fd, struct epoll_event *ev, int cnt, int to, const sigset_t *sigs)
 {
-	int r = __syscall(SYS_epoll_pwait, fd, ev, cnt, to, sigs, _NSIG/8);
+	int r = __syscall_cp(SYS_epoll_pwait, fd, ev, cnt, to, sigs, _NSIG/8);
 #ifdef SYS_epoll_wait
-	if (r==-ENOSYS && !sigs) r = __syscall(SYS_epoll_wait, fd, ev, cnt, to);
+	if (r==-ENOSYS && !sigs) r = __syscall_cp(SYS_epoll_wait, fd, ev, cnt, to);
 #endif
 	return __syscall_ret(r);
 }
diff --git a/src/linux/fallocate.c b/src/linux/fallocate.c
index 7d68bc8f..9146350e 100644
--- a/src/linux/fallocate.c
+++ b/src/linux/fallocate.c
@@ -7,6 +7,3 @@ int fallocate(int fd, int mode, off_t base, off_t len)
 	return syscall(SYS_fallocate, fd, mode, __SYSCALL_LL_E(base),
 		__SYSCALL_LL_E(len));
 }
-
-#undef fallocate64
-weak_alias(fallocate, fallocate64);
diff --git a/src/linux/getdents.c b/src/linux/getdents.c
index 796c1e5c..97f76e14 100644
--- a/src/linux/getdents.c
+++ b/src/linux/getdents.c
@@ -8,5 +8,3 @@ int getdents(int fd, struct dirent *buf, size_t len)
 	if (len>INT_MAX) len = INT_MAX;
 	return syscall(SYS_getdents, fd, buf, len);
 }
-
-weak_alias(getdents, getdents64);
diff --git a/src/linux/gettid.c b/src/linux/gettid.c
new file mode 100644
index 00000000..70767137
--- /dev/null
+++ b/src/linux/gettid.c
@@ -0,0 +1,8 @@
+#define _GNU_SOURCE
+#include <unistd.h>
+#include "pthread_impl.h"
+
+pid_t gettid(void)
+{
+	return __pthread_self()->tid;
+}
diff --git a/src/linux/membarrier.c b/src/linux/membarrier.c
index 9ebe906e..f64fe7e1 100644
--- a/src/linux/membarrier.c
+++ b/src/linux/membarrier.c
@@ -9,13 +9,8 @@ static void dummy_0(void)
 {
 }
 
-static void dummy_1(pthread_t t)
-{
-}
-
 weak_alias(dummy_0, __tl_lock);
 weak_alias(dummy_0, __tl_unlock);
-weak_alias(dummy_1, __tl_sync);
 
 static sem_t barrier_sem;
 
@@ -40,7 +35,7 @@ int __membarrier(int cmd, int flags)
 		__tl_lock();
 		sem_init(&barrier_sem, 0, 0);
 		struct sigaction sa = {
-			.sa_flags = SA_RESTART,
+			.sa_flags = SA_RESTART | SA_ONSTACK,
 			.sa_handler = bcast_barrier
 		};
 		memset(&sa.sa_mask, -1, sizeof sa.sa_mask);
diff --git a/src/linux/preadv2.c b/src/linux/preadv2.c
new file mode 100644
index 00000000..5e7ab70f
--- /dev/null
+++ b/src/linux/preadv2.c
@@ -0,0 +1,17 @@
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <unistd.h>
+#include "syscall.h"
+
+ssize_t preadv2(int fd, const struct iovec *iov, int count, off_t ofs, int flags)
+{
+#ifdef SYS_preadv
+	if (!flags) {
+		if (ofs==-1) return readv(fd, iov, count);
+		return syscall_cp(SYS_preadv, fd, iov, count,
+			(long)(ofs), (long)(ofs>>32));
+	}
+#endif
+	return syscall_cp(SYS_preadv2, fd, iov, count,
+		(long)(ofs), (long)(ofs>>32), flags);
+}
diff --git a/src/linux/prlimit.c b/src/linux/prlimit.c
index 3df9ffba..fcf45aab 100644
--- a/src/linux/prlimit.c
+++ b/src/linux/prlimit.c
@@ -21,6 +21,3 @@ int prlimit(pid_t pid, int resource, const struct rlimit *new_limit, struct rlim
 	}
 	return r;
 }
-
-#undef prlimit64
-weak_alias(prlimit, prlimit64);
diff --git a/src/linux/pwritev2.c b/src/linux/pwritev2.c
new file mode 100644
index 00000000..ece90d7c
--- /dev/null
+++ b/src/linux/pwritev2.c
@@ -0,0 +1,17 @@
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <unistd.h>
+#include "syscall.h"
+
+ssize_t pwritev2(int fd, const struct iovec *iov, int count, off_t ofs, int flags)
+{
+#ifdef SYS_pwritev
+	if (!flags) {
+		if (ofs==-1) return writev(fd, iov, count);
+		return syscall_cp(SYS_pwritev, fd, iov, count,
+			(long)(ofs), (long)(ofs>>32));
+	}
+#endif
+	return syscall_cp(SYS_pwritev2, fd, iov, count,
+		(long)(ofs), (long)(ofs>>32), flags);
+}
diff --git a/src/linux/renameat2.c b/src/linux/renameat2.c
new file mode 100644
index 00000000..b8060388
--- /dev/null
+++ b/src/linux/renameat2.c
@@ -0,0 +1,11 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include "syscall.h"
+
+int renameat2(int oldfd, const char *old, int newfd, const char *new, unsigned flags)
+{
+#ifdef SYS_renameat
+	if (!flags) return syscall(SYS_renameat, oldfd, old, newfd, new);
+#endif
+	return syscall(SYS_renameat2, oldfd, old, newfd, new, flags);
+}
diff --git a/src/linux/sendfile.c b/src/linux/sendfile.c
index 9afe6dd6..fc1577d3 100644
--- a/src/linux/sendfile.c
+++ b/src/linux/sendfile.c
@@ -5,5 +5,3 @@ ssize_t sendfile(int out_fd, int in_fd, off_t *ofs, size_t count)
 {
 	return syscall(SYS_sendfile, out_fd, in_fd, ofs, count);
 }
-
-weak_alias(sendfile, sendfile64);
diff --git a/src/linux/setgroups.c b/src/linux/setgroups.c
index 1248fdbf..47142f14 100644
--- a/src/linux/setgroups.c
+++ b/src/linux/setgroups.c
@@ -1,8 +1,36 @@
 #define _GNU_SOURCE
 #include <unistd.h>
+#include <signal.h>
 #include "syscall.h"
+#include "libc.h"
+
+struct ctx {
+	size_t count;
+	const gid_t *list;
+	int ret;
+};
+
+static void do_setgroups(void *p)
+{
+	struct ctx *c = p;
+	if (c->ret<0) return;
+	int ret = __syscall(SYS_setgroups, c->count, c->list);
+	if (ret && !c->ret) {
+		/* If one thread fails to set groups after another has already
+		 * succeeded, forcibly killing the process is the only safe
+		 * thing to do. State is inconsistent and dangerous. Use
+		 * SIGKILL because it is uncatchable. */
+		__block_all_sigs(0);
+		__syscall(SYS_kill, __syscall(SYS_getpid), SIGKILL);
+	}
+	c->ret = ret;
+}
 
 int setgroups(size_t count, const gid_t list[])
 {
-	return syscall(SYS_setgroups, count, list);
+	/* ret is initially nonzero so that failure of the first thread does not
+	 * trigger the safety kill above. */
+	struct ctx c = { .count = count, .list = list, .ret = 1 };
+	__synccall(do_setgroups, &c);
+	return __syscall_ret(c.ret);
 }
diff --git a/src/linux/statx.c b/src/linux/statx.c
new file mode 100644
index 00000000..4fb96e4b
--- /dev/null
+++ b/src/linux/statx.c
@@ -0,0 +1,44 @@
+#define _GNU_SOURCE
+#include <sys/stat.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/sysmacros.h>
+#include <errno.h>
+
+int statx(int dirfd, const char *restrict path, int flags, unsigned mask, struct statx *restrict stx)
+{
+	int ret = __syscall(SYS_statx, dirfd, path, flags, mask, stx);
+
+#ifndef SYS_fstatat
+	return __syscall_ret(ret);
+#endif
+
+	if (ret != -ENOSYS) return __syscall_ret(ret);
+
+	struct stat st;
+	ret = fstatat(dirfd, path, &st, flags);
+	if (ret) return ret;
+
+	*stx = (struct statx){0};
+	stx->stx_dev_major = major(st.st_dev);
+	stx->stx_dev_minor = minor(st.st_dev);
+	stx->stx_rdev_major = major(st.st_rdev);
+	stx->stx_rdev_minor = minor(st.st_rdev);
+	stx->stx_ino = st.st_ino;
+	stx->stx_mode = st.st_mode;
+	stx->stx_nlink = st.st_nlink;
+	stx->stx_uid = st.st_uid;
+	stx->stx_gid = st.st_gid;
+	stx->stx_size = st.st_size;
+	stx->stx_blksize = st.st_blksize;
+	stx->stx_blocks = st.st_blocks;
+	stx->stx_atime.tv_sec = st.st_atim.tv_sec;
+	stx->stx_atime.tv_nsec = st.st_atim.tv_nsec;
+	stx->stx_mtime.tv_sec = st.st_mtim.tv_sec;
+	stx->stx_mtime.tv_nsec = st.st_mtim.tv_nsec;
+	stx->stx_ctime.tv_sec = st.st_ctim.tv_sec;
+	stx->stx_ctime.tv_nsec = st.st_ctim.tv_nsec;
+	stx->stx_mask = STATX_BASIC_STATS;
+
+	return 0;
+}
diff --git a/src/linux/wait4.c b/src/linux/wait4.c
index 83650e34..fb08c0d0 100644
--- a/src/linux/wait4.c
+++ b/src/linux/wait4.c
@@ -12,7 +12,7 @@ pid_t wait4(pid_t pid, int *status, int options, struct rusage *ru)
 	if (ru) {
 		long long kru64[18];
 		r = __syscall(SYS_wait4_time64, pid, status, options, kru64);
-		if (!r) {
+		if (r > 0) {
 			ru->ru_utime = (struct timeval)
 				{ .tv_sec = kru64[0], .tv_usec = kru64[1] };
 			ru->ru_stime = (struct timeval)
@@ -26,7 +26,7 @@ pid_t wait4(pid_t pid, int *status, int options, struct rusage *ru)
 	}
 #endif
 	char *dest = ru ? (char *)&ru->ru_maxrss - 4*sizeof(long) : 0;
-	r = __syscall(SYS_wait4, pid, status, options, dest);
+	r = __sys_wait4(pid, status, options, dest);
 	if (r>0 && ru && sizeof(time_t) > sizeof(long)) {
 		long kru[4];
 		memcpy(kru, dest, 4*sizeof(long));
diff --git a/src/locale/bind_textdomain_codeset.c b/src/locale/bind_textdomain_codeset.c
index 5ebfd5e8..240e83ed 100644
--- a/src/locale/bind_textdomain_codeset.c
+++ b/src/locale/bind_textdomain_codeset.c
@@ -5,7 +5,9 @@
 
 char *bind_textdomain_codeset(const char *domainname, const char *codeset)
 {
-	if (codeset && strcasecmp(codeset, "UTF-8"))
+	if (codeset && strcasecmp(codeset, "UTF-8")) {
 		errno = EINVAL;
-	return NULL;
+		return 0;
+	}
+	return "UTF-8";
 }
diff --git a/src/locale/codepages.h b/src/locale/codepages.h
index 4e236ef3..a254f0f5 100644
--- a/src/locale/codepages.h
+++ b/src/locale/codepages.h
@@ -286,6 +286,17 @@
 "\323\174\103\215\64\365\124\123\213\77\336\150\263\115\66\375\164\363\12\55"
 "\255\304\42\261\57\266\234\162\17\56\260\240\162\113\56\263\310\62\66\50"
 
+"cp858\0"
+"\0\40"
+"\307\360\223\216\70\344\200\123\316\71\352\254\203\316\73\356\260\103\114\61"
+"\311\230\143\14\75\366\310\263\117\76\377\130\303\15\76\243\140\163\15\135"
+"\341\264\63\217\76\361\104\243\212\56\277\270\302\112\57\274\204\262\312\56"
+"\140\207\55\66\315\72\7\43\14\60\251\104\375\163\321\113\213\122\212\315"
+"\67\363\274\163\316\63\367\74\316\60\110\13\175\65\325\116\373\254\65\51"
+"\360\100\243\314\62\310\220\334\214\63\317\340\134\163\327\134\233\302\314\326"
+"\323\174\103\215\64\365\124\123\213\77\336\150\263\115\66\375\164\363\12\55"
+"\255\304\42\261\57\266\234\162\17\56\260\240\162\113\56\263\310\62\66\50"
+
 "cp866\0"
 "\0\40"
 "\337\201\27\236\170\343\221\127\236\171\347\241\227\236\172"
diff --git a/src/locale/dcngettext.c b/src/locale/dcngettext.c
index 4c304393..0b53286d 100644
--- a/src/locale/dcngettext.c
+++ b/src/locale/dcngettext.c
@@ -10,6 +10,12 @@
 #include "atomic.h"
 #include "pleval.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 struct binding {
 	struct binding *next;
@@ -34,9 +40,11 @@ static char *gettextdir(const char *domainname, size_t *dirlen)
 	return 0;
 }
 
+static volatile int lock[1];
+volatile int *const __gettext_lockptr = lock;
+
 char *bindtextdomain(const char *domainname, const char *dirname)
 {
-	static volatile int lock[1];
 	struct binding *p, *q;
 
 	if (!domainname) return 0;
@@ -124,6 +132,9 @@ char *dcngettext(const char *domainname, const char *msgid1, const char *msgid2,
 	struct binding *q;
 	int old_errno = errno;
 
+	/* match gnu gettext behaviour */
+	if (!msgid1) goto notrans;
+
 	if ((unsigned)category >= LC_ALL) goto notrans;
 
 	if (!domainname) domainname = __gettextdomain();
diff --git a/src/locale/duplocale.c b/src/locale/duplocale.c
index 030b64cb..5ce33ae6 100644
--- a/src/locale/duplocale.c
+++ b/src/locale/duplocale.c
@@ -3,6 +3,11 @@
 #include "locale_impl.h"
 #include "libc.h"
 
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
+
 locale_t __duplocale(locale_t old)
 {
 	locale_t new = malloc(sizeof *new);
diff --git a/src/locale/freelocale.c b/src/locale/freelocale.c
index 802b8bfe..385d1206 100644
--- a/src/locale/freelocale.c
+++ b/src/locale/freelocale.c
@@ -1,6 +1,11 @@
 #include <stdlib.h>
 #include "locale_impl.h"
 
+#define malloc undef
+#define calloc undef
+#define realloc undef
+#define free __libc_free
+
 void freelocale(locale_t l)
 {
 	if (__loc_is_allocated(l)) free(l);
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 3047c27b..52178950 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -49,10 +49,10 @@ static const unsigned char charmaps[] =
 "ucs4\0utf32\0\0\313"
 "ucs2\0\0\314"
 "eucjp\0\0\320"
-"shiftjis\0sjis\0\0\321"
+"shiftjis\0sjis\0cp932\0\0\321"
 "iso2022jp\0\0\322"
 "gb18030\0\0\330"
-"gbk\0\0\331"
+"gbk\0cp936\0windows936\0\0\331"
 "gb2312\0\0\332"
 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
@@ -339,7 +339,10 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			} else if (d-159 <= 252-159) {
 				c++;
 				d -= 159;
+			} else {
+				goto ilseq;
 			}
+			if (c>=84) goto ilseq;
 			c = jis0208[c][d];
 			if (!c) goto ilseq;
 			break;
@@ -403,6 +406,10 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (c < 128) break;
 			if (c < 0xa1) goto ilseq;
 		case GBK:
+			if (c == 128) {
+				c = 0x20ac;
+				break;
+			}
 		case GB18030:
 			if (c < 128) break;
 			c -= 0x81;
@@ -495,7 +502,7 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (c >= 93 || d >= 94) {
 				c += (0xa1-0x81);
 				d += 0xa1;
-				if (c >= 93 || c>=0xc6-0x81 && d>0x52)
+				if (c > 0xc6-0x81 || c==0xc6-0x81 && d>0x52)
 					goto ilseq;
 				if (d-'A'<26) d = d-'A';
 				else if (d-'a'<26) d = d-'a'+26;
@@ -538,6 +545,10 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				if (*outb < k) goto toobig;
 				memcpy(*out, tmp, k);
 			} else k = wctomb_utf8(*out, c);
+			/* This failure condition should be unreachable, but
+			 * is included to prevent decoder bugs from translating
+			 * into advancement outside the output buffer range. */
+			if (k>4) goto ilseq;
 			*out += k;
 			*outb -= k;
 			break;
diff --git a/src/locale/locale_map.c b/src/locale/locale_map.c
index 2321bac0..da61f7fc 100644
--- a/src/locale/locale_map.c
+++ b/src/locale/locale_map.c
@@ -1,9 +1,16 @@
 #include <locale.h>
 #include <string.h>
 #include <sys/mman.h>
+#include <stdlib.h>
 #include "locale_impl.h"
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
 const char *__lctrans_impl(const char *msg, const struct __locale_map *lm)
 {
@@ -21,9 +28,11 @@ static const char envvars[][12] = {
 	"LC_MESSAGES",
 };
 
+volatile int __locale_lock[1];
+volatile int *const __locale_lockptr = __locale_lock;
+
 const struct __locale_map *__get_locale(int cat, const char *val)
 {
-	static volatile int lock[1];
 	static void *volatile loc_head;
 	const struct __locale_map *p;
 	struct __locale_map *new = 0;
@@ -54,20 +63,12 @@ const struct __locale_map *__get_locale(int cat, const char *val)
 	for (p=loc_head; p; p=p->next)
 		if (!strcmp(val, p->name)) return p;
 
-	LOCK(lock);
-
-	for (p=loc_head; p; p=p->next)
-		if (!strcmp(val, p->name)) {
-			UNLOCK(lock);
-			return p;
-		}
-
 	if (!libc.secure) path = getenv("MUSL_LOCPATH");
 	/* FIXME: add a default path? */
 
 	if (path) for (; *path; path=z+!!*z) {
 		z = __strchrnul(path, ':');
-		l = z - path - !!*z;
+		l = z - path;
 		if (l >= sizeof buf - n - 2) continue;
 		memcpy(buf, path, l);
 		buf[l] = '/';
@@ -108,6 +109,5 @@ const struct __locale_map *__get_locale(int cat, const char *val)
 	 * requested name was "C" or "POSIX". */
 	if (!new && cat == LC_CTYPE) new = (void *)&__c_dot_utf8;
 
-	UNLOCK(lock);
 	return new;
 }
diff --git a/src/locale/newlocale.c b/src/locale/newlocale.c
index d20a8489..9ac3cd38 100644
--- a/src/locale/newlocale.c
+++ b/src/locale/newlocale.c
@@ -2,16 +2,15 @@
 #include <string.h>
 #include <pthread.h>
 #include "locale_impl.h"
+#include "lock.h"
 
-static pthread_once_t default_locale_once;
-static struct __locale_struct default_locale, default_ctype_locale;
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
-static void default_locale_init(void)
-{
-	for (int i=0; i<LC_ALL; i++)
-		default_locale.cat[i] = __get_locale(i, "");
-	default_ctype_locale.cat[LC_CTYPE] = default_locale.cat[LC_CTYPE];
-}
+static int default_locale_init_done;
+static struct __locale_struct default_locale, default_ctype_locale;
 
 int __loc_is_allocated(locale_t loc)
 {
@@ -19,7 +18,7 @@ int __loc_is_allocated(locale_t loc)
 		&& loc != &default_locale && loc != &default_ctype_locale;
 }
 
-locale_t __newlocale(int mask, const char *name, locale_t loc)
+static locale_t do_newlocale(int mask, const char *name, locale_t loc)
 {
 	struct __locale_struct tmp;
 
@@ -44,7 +43,12 @@ locale_t __newlocale(int mask, const char *name, locale_t loc)
 
 	/* And provide builtins for the initial default locale, and a
 	 * variant of the C locale honoring the default locale's encoding. */
-	pthread_once(&default_locale_once, default_locale_init);
+	if (!default_locale_init_done) {
+		for (int i=0; i<LC_ALL; i++)
+			default_locale.cat[i] = __get_locale(i, "");
+		default_ctype_locale.cat[LC_CTYPE] = default_locale.cat[LC_CTYPE];
+		default_locale_init_done = 1;
+	}
 	if (!memcmp(&tmp, &default_locale, sizeof tmp)) return &default_locale;
 	if (!memcmp(&tmp, &default_ctype_locale, sizeof tmp))
 		return &default_ctype_locale;
@@ -55,4 +59,12 @@ locale_t __newlocale(int mask, const char *name, locale_t loc)
 	return loc;
 }
 
+locale_t __newlocale(int mask, const char *name, locale_t loc)
+{
+	LOCK(__locale_lock);
+	loc = do_newlocale(mask, name, loc);
+	UNLOCK(__locale_lock);
+	return loc;
+}
+
 weak_alias(__newlocale, newlocale);
diff --git a/src/locale/setlocale.c b/src/locale/setlocale.c
index 2bc7b500..360c4437 100644
--- a/src/locale/setlocale.c
+++ b/src/locale/setlocale.c
@@ -9,12 +9,11 @@ static char buf[LC_ALL*(LOCALE_NAME_MAX+1)];
 
 char *setlocale(int cat, const char *name)
 {
-	static volatile int lock[1];
 	const struct __locale_map *lm;
 
 	if ((unsigned)cat > LC_ALL) return 0;
 
-	LOCK(lock);
+	LOCK(__locale_lock);
 
 	/* For LC_ALL, setlocale is required to return a string which
 	 * encodes the current setting for all categories. The format of
@@ -36,7 +35,7 @@ char *setlocale(int cat, const char *name)
 				}
 				lm = __get_locale(i, part);
 				if (lm == LOC_MAP_FAILED) {
-					UNLOCK(lock);
+					UNLOCK(__locale_lock);
 					return 0;
 				}
 				tmp_locale.cat[i] = lm;
@@ -57,14 +56,14 @@ char *setlocale(int cat, const char *name)
 			s += l+1;
 		}
 		*--s = 0;
-		UNLOCK(lock);
+		UNLOCK(__locale_lock);
 		return same==LC_ALL ? (char *)part : buf;
 	}
 
 	if (name) {
 		lm = __get_locale(cat, name);
 		if (lm == LOC_MAP_FAILED) {
-			UNLOCK(lock);
+			UNLOCK(__locale_lock);
 			return 0;
 		}
 		libc.global_locale.cat[cat] = lm;
@@ -73,7 +72,7 @@ char *setlocale(int cat, const char *name)
 	}
 	char *ret = lm ? (char *)lm->name : "C";
 
-	UNLOCK(lock);
+	UNLOCK(__locale_lock);
 
 	return ret;
 }
diff --git a/src/locale/strtod_l.c b/src/locale/strtod_l.c
new file mode 100644
index 00000000..574ba148
--- /dev/null
+++ b/src/locale/strtod_l.c
@@ -0,0 +1,22 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <locale.h>
+
+float strtof_l(const char *restrict s, char **restrict p, locale_t l)
+{
+	return strtof(s, p);
+}
+
+double strtod_l(const char *restrict s, char **restrict p, locale_t l)
+{
+	return strtod(s, p);
+}
+
+long double strtold_l(const char *restrict s, char **restrict p, locale_t l)
+{
+	return strtold(s, p);
+}
+
+weak_alias(strtof_l, __strtof_l);
+weak_alias(strtod_l, __strtod_l);
+weak_alias(strtold_l, __strtold_l);
diff --git a/src/malloc/DESIGN b/src/malloc/DESIGN
deleted file mode 100644
index 58b0523f..00000000
--- a/src/malloc/DESIGN
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-In principle, this memory allocator is roughly equivalent to Doug
-Lea's dlmalloc with fine-grained locking.
-
-
-
-malloc:
-
-Uses a freelist binned by chunk size, with a bitmap to optimize
-searching for the smallest non-empty bin which can satisfy an
-allocation. If no free chunks are available, it creates a new chunk of
-the requested size and attempts to merge it with any existing free
-chunk immediately below the newly created chunk.
-
-Whether the chunk was obtained from a bin or newly created, it's
-likely to be larger than the requested allocation. malloc always
-finishes its work by passing the new chunk to realloc, which will
-split it into two chunks and free the tail portion.
-
-
-
diff --git a/src/malloc/aligned_alloc.c b/src/malloc/aligned_alloc.c
deleted file mode 100644
index b6143f30..00000000
--- a/src/malloc/aligned_alloc.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <stdlib.h>
-#include "malloc_impl.h"
-
-void *aligned_alloc(size_t align, size_t len)
-{
-	return __memalign(align, len);
-}
diff --git a/src/malloc/calloc.c b/src/malloc/calloc.c
new file mode 100644
index 00000000..bf6bddca
--- /dev/null
+++ b/src/malloc/calloc.c
@@ -0,0 +1,45 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include "dynlink.h"
+
+static size_t mal0_clear(char *p, size_t n)
+{
+	const size_t pagesz = 4096; /* arbitrary */
+	if (n < pagesz) return n;
+#ifdef __GNUC__
+	typedef uint64_t __attribute__((__may_alias__)) T;
+#else
+	typedef unsigned char T;
+#endif
+	char *pp = p + n;
+	size_t i = (uintptr_t)pp & (pagesz - 1);
+	for (;;) {
+		pp = memset(pp - i, 0, i);
+		if (pp - p < pagesz) return pp - p;
+		for (i = pagesz; i; i -= 2*sizeof(T), pp -= 2*sizeof(T))
+		        if (((T *)pp)[-1] | ((T *)pp)[-2])
+				break;
+	}
+}
+
+static int allzerop(void *p)
+{
+	return 0;
+}
+weak_alias(allzerop, __malloc_allzerop);
+
+void *calloc(size_t m, size_t n)
+{
+	if (n && m > (size_t)-1/n) {
+		errno = ENOMEM;
+		return 0;
+	}
+	n *= m;
+	void *p = malloc(n);
+	if (!p || (!__malloc_replaced && __malloc_allzerop(p)))
+		return p;
+	n = mal0_clear(p, n);
+	return memset(p, 0, n);
+}
diff --git a/src/malloc/expand_heap.c b/src/malloc/expand_heap.c
deleted file mode 100644
index e6a3d7a0..00000000
--- a/src/malloc/expand_heap.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <limits.h>
-#include <stdint.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include "libc.h"
-#include "syscall.h"
-#include "malloc_impl.h"
-
-/* This function returns true if the interval [old,new]
- * intersects the 'len'-sized interval below &libc.auxv
- * (interpreted as the main-thread stack) or below &b
- * (the current stack). It is used to defend against
- * buggy brk implementations that can cross the stack. */
-
-static int traverses_stack_p(uintptr_t old, uintptr_t new)
-{
-	const uintptr_t len = 8<<20;
-	uintptr_t a, b;
-
-	b = (uintptr_t)libc.auxv;
-	a = b > len ? b-len : 0;
-	if (new>a && old<b) return 1;
-
-	b = (uintptr_t)&b;
-	a = b > len ? b-len : 0;
-	if (new>a && old<b) return 1;
-
-	return 0;
-}
-
-/* Expand the heap in-place if brk can be used, or otherwise via mmap,
- * using an exponential lower bound on growth by mmap to make
- * fragmentation asymptotically irrelevant. The size argument is both
- * an input and an output, since the caller needs to know the size
- * allocated, which will be larger than requested due to page alignment
- * and mmap minimum size rules. The caller is responsible for locking
- * to prevent concurrent calls. */
-
-void *__expand_heap(size_t *pn)
-{
-	static uintptr_t brk;
-	static unsigned mmap_step;
-	size_t n = *pn;
-
-	if (n > SIZE_MAX/2 - PAGE_SIZE) {
-		errno = ENOMEM;
-		return 0;
-	}
-	n += -n & PAGE_SIZE-1;
-
-	if (!brk) {
-		brk = __syscall(SYS_brk, 0);
-		brk += -brk & PAGE_SIZE-1;
-	}
-
-	if (n < SIZE_MAX-brk && !traverses_stack_p(brk, brk+n)
-	    && __syscall(SYS_brk, brk+n)==brk+n) {
-		*pn = n;
-		brk += n;
-		return (void *)(brk-n);
-	}
-
-	size_t min = (size_t)PAGE_SIZE << mmap_step/2;
-	if (n < min) n = min;
-	void *area = __mmap(0, n, PROT_READ|PROT_WRITE,
-		MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-	if (area == MAP_FAILED) return 0;
-	*pn = n;
-	mmap_step++;
-	return area;
-}
diff --git a/src/malloc/free.c b/src/malloc/free.c
new file mode 100644
index 00000000..3944f7b2
--- /dev/null
+++ b/src/malloc/free.c
@@ -0,0 +1,6 @@
+#include <stdlib.h>
+
+void free(void *p)
+{
+	__libc_free(p);
+}
diff --git a/src/malloc/libc_calloc.c b/src/malloc/libc_calloc.c
new file mode 100644
index 00000000..d25eabea
--- /dev/null
+++ b/src/malloc/libc_calloc.c
@@ -0,0 +1,4 @@
+#define calloc __libc_calloc
+#define malloc __libc_malloc
+
+#include "calloc.c"
diff --git a/src/malloc/lite_malloc.c b/src/malloc/lite_malloc.c
index 050d84f6..43a988fb 100644
--- a/src/malloc/lite_malloc.c
+++ b/src/malloc/lite_malloc.c
@@ -2,58 +2,117 @@
 #include <stdint.h>
 #include <limits.h>
 #include <errno.h>
+#include <sys/mman.h>
+#include "libc.h"
 #include "lock.h"
-#include "malloc_impl.h"
+#include "syscall.h"
+#include "fork_impl.h"
 
 #define ALIGN 16
 
+/* This function returns true if the interval [old,new]
+ * intersects the 'len'-sized interval below &libc.auxv
+ * (interpreted as the main-thread stack) or below &b
+ * (the current stack). It is used to defend against
+ * buggy brk implementations that can cross the stack. */
+
+static int traverses_stack_p(uintptr_t old, uintptr_t new)
+{
+	const uintptr_t len = 8<<20;
+	uintptr_t a, b;
+
+	b = (uintptr_t)libc.auxv;
+	a = b > len ? b-len : 0;
+	if (new>a && old<b) return 1;
+
+	b = (uintptr_t)&b;
+	a = b > len ? b-len : 0;
+	if (new>a && old<b) return 1;
+
+	return 0;
+}
+
+static volatile int lock[1];
+volatile int *const __bump_lockptr = lock;
+
 static void *__simple_malloc(size_t n)
 {
-	static char *cur, *end;
-	static volatile int lock[1];
-	size_t align=1, pad;
+	static uintptr_t brk, cur, end;
+	static unsigned mmap_step;
+	size_t align=1;
 	void *p;
 
+	if (n > SIZE_MAX/2) {
+		errno = ENOMEM;
+		return 0;
+	}
+
 	if (!n) n++;
 	while (align<n && align<ALIGN)
 		align += align;
 
 	LOCK(lock);
 
-	pad = -(uintptr_t)cur & align-1;
-
-	if (n <= SIZE_MAX/2 + ALIGN) n += pad;
+	cur += -cur & align-1;
 
 	if (n > end-cur) {
-		size_t m = n;
-		char *new = __expand_heap(&m);
-		if (!new) {
-			UNLOCK(lock);
-			return 0;
+		size_t req = n - (end-cur) + PAGE_SIZE-1 & -PAGE_SIZE;
+
+		if (!cur) {
+			brk = __syscall(SYS_brk, 0);
+			brk += -brk & PAGE_SIZE-1;
+			cur = end = brk;
 		}
-		if (new != end) {
-			cur = new;
-			n -= pad;
-			pad = 0;
+
+		if (brk == end && req < SIZE_MAX-brk
+		    && !traverses_stack_p(brk, brk+req)
+		    && __syscall(SYS_brk, brk+req)==brk+req) {
+			brk = end += req;
+		} else {
+			int new_area = 0;
+			req = n + PAGE_SIZE-1 & -PAGE_SIZE;
+			/* Only make a new area rather than individual mmap
+			 * if wasted space would be over 1/8 of the map. */
+			if (req-n > req/8) {
+				/* Geometric area size growth up to 64 pages,
+				 * bounding waste by 1/8 of the area. */
+				size_t min = PAGE_SIZE<<(mmap_step/2);
+				if (min-n > end-cur) {
+					if (req < min) {
+						req = min;
+						if (mmap_step < 12)
+							mmap_step++;
+					}
+					new_area = 1;
+				}
+			}
+			void *mem = __mmap(0, req, PROT_READ|PROT_WRITE,
+				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+			if (mem == MAP_FAILED || !new_area) {
+				UNLOCK(lock);
+				return mem==MAP_FAILED ? 0 : mem;
+			}
+			cur = (uintptr_t)mem;
+			end = cur + req;
 		}
-		end = new + m;
 	}
 
-	p = cur + pad;
+	p = (void *)cur;
 	cur += n;
 	UNLOCK(lock);
 	return p;
 }
 
-weak_alias(__simple_malloc, malloc);
+weak_alias(__simple_malloc, __libc_malloc_impl);
 
-static void *__simple_calloc(size_t m, size_t n)
+void *__libc_malloc(size_t n)
 {
-	if (n && m > (size_t)-1/n) {
-		errno = ENOMEM;
-		return 0;
-	}
-	return __simple_malloc(n * m);
+	return __libc_malloc_impl(n);
+}
+
+static void *default_malloc(size_t n)
+{
+	return __libc_malloc_impl(n);
 }
 
-weak_alias(__simple_calloc, calloc);
+weak_alias(default_malloc, malloc);
diff --git a/src/malloc/mallocng/aligned_alloc.c b/src/malloc/mallocng/aligned_alloc.c
new file mode 100644
index 00000000..e0862a83
--- /dev/null
+++ b/src/malloc/mallocng/aligned_alloc.c
@@ -0,0 +1,60 @@
+#include <stdlib.h>
+#include <errno.h>
+#include "meta.h"
+
+void *aligned_alloc(size_t align, size_t len)
+{
+	if ((align & -align) != align) {
+		errno = EINVAL;
+		return 0;
+	}
+
+	if (len > SIZE_MAX - align || align >= (1ULL<<31)*UNIT) {
+		errno = ENOMEM;
+		return 0;
+	}
+
+	if (DISABLE_ALIGNED_ALLOC) {
+		errno = ENOMEM;
+		return 0;
+	}
+
+	if (align <= UNIT) align = UNIT;
+
+	unsigned char *p = malloc(len + align - UNIT);
+	if (!p)
+		return 0;
+
+	struct meta *g = get_meta(p);
+	int idx = get_slot_index(p);
+	size_t stride = get_stride(g);
+	unsigned char *start = g->mem->storage + stride*idx;
+	unsigned char *end = g->mem->storage + stride*(idx+1) - IB;
+	size_t adj = -(uintptr_t)p & (align-1);
+
+	if (!adj) {
+		set_size(p, end, len);
+		return p;
+	}
+	p += adj;
+	uint32_t offset = (size_t)(p-g->mem->storage)/UNIT;
+	if (offset <= 0xffff) {
+		*(uint16_t *)(p-2) = offset;
+		p[-4] = 0;
+	} else {
+		// use a 32-bit offset if 16-bit doesn't fit. for this,
+		// 16-bit field must be zero, [-4] byte nonzero.
+		*(uint16_t *)(p-2) = 0;
+		*(uint32_t *)(p-8) = offset;
+		p[-4] = 1;
+	}
+	p[-3] = idx;
+	set_size(p, end, len);
+	// store offset to aligned enframing. this facilitates cycling
+	// offset and also iteration of heap for debugging/measurement.
+	// for extreme overalignment it won't fit but these are classless
+	// allocations anyway.
+	*(uint16_t *)(start - 2) = (size_t)(p-start)/UNIT;
+	start[-3] = 7<<5;
+	return p;
+}
diff --git a/src/malloc/mallocng/donate.c b/src/malloc/mallocng/donate.c
new file mode 100644
index 00000000..41d850f3
--- /dev/null
+++ b/src/malloc/mallocng/donate.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#include "meta.h"
+
+static void donate(unsigned char *base, size_t len)
+{
+	uintptr_t a = (uintptr_t)base;
+	uintptr_t b = a + len;
+	a += -a & (UNIT-1);
+	b -= b & (UNIT-1);
+	memset(base, 0, len);
+	for (int sc=47; sc>0 && b>a; sc-=4) {
+		if (b-a < (size_classes[sc]+1)*UNIT) continue;
+		struct meta *m = alloc_meta();
+		m->avail_mask = 0;
+		m->freed_mask = 1;
+		m->mem = (void *)a;
+		m->mem->meta = m;
+		m->last_idx = 0;
+		m->freeable = 0;
+		m->sizeclass = sc;
+		m->maplen = 0;
+		*((unsigned char *)m->mem+UNIT-4) = 0;
+		*((unsigned char *)m->mem+UNIT-3) = 255;
+		m->mem->storage[size_classes[sc]*UNIT-4] = 0;
+		queue(&ctx.active[sc], m);
+		a += (size_classes[sc]+1)*UNIT;
+	}
+}
+
+void __malloc_donate(char *start, char *end)
+{
+	donate((void *)start, end-start);
+}
diff --git a/src/malloc/mallocng/free.c b/src/malloc/mallocng/free.c
new file mode 100644
index 00000000..43f32aad
--- /dev/null
+++ b/src/malloc/mallocng/free.c
@@ -0,0 +1,151 @@
+#define _BSD_SOURCE
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "meta.h"
+
+struct mapinfo {
+	void *base;
+	size_t len;
+};
+
+static struct mapinfo nontrivial_free(struct meta *, int);
+
+static struct mapinfo free_group(struct meta *g)
+{
+	struct mapinfo mi = { 0 };
+	int sc = g->sizeclass;
+	if (sc < 48) {
+		ctx.usage_by_class[sc] -= g->last_idx+1;
+	}
+	if (g->maplen) {
+		step_seq();
+		record_seq(sc);
+		mi.base = g->mem;
+		mi.len = g->maplen*4096UL;
+	} else {
+		void *p = g->mem;
+		struct meta *m = get_meta(p);
+		int idx = get_slot_index(p);
+		g->mem->meta = 0;
+		// not checking size/reserved here; it's intentionally invalid
+		mi = nontrivial_free(m, idx);
+	}
+	free_meta(g);
+	return mi;
+}
+
+static int okay_to_free(struct meta *g)
+{
+	int sc = g->sizeclass;
+
+	if (!g->freeable) return 0;
+
+	// always free individual mmaps not suitable for reuse
+	if (sc >= 48 || get_stride(g) < UNIT*size_classes[sc])
+		return 1;
+
+	// always free groups allocated inside another group's slot
+	// since recreating them should not be expensive and they
+	// might be blocking freeing of a much larger group.
+	if (!g->maplen) return 1;
+
+	// if there is another non-full group, free this one to
+	// consolidate future allocations, reduce fragmentation.
+	if (g->next != g) return 1;
+
+	// free any group in a size class that's not bouncing
+	if (!is_bouncing(sc)) return 1;
+
+	size_t cnt = g->last_idx+1;
+	size_t usage = ctx.usage_by_class[sc];
+
+	// if usage is high enough that a larger count should be
+	// used, free the low-count group so a new one will be made.
+	if (9*cnt <= usage && cnt < 20)
+		return 1;
+
+	// otherwise, keep the last group in a bouncing class.
+	return 0;
+}
+
+static struct mapinfo nontrivial_free(struct meta *g, int i)
+{
+	uint32_t self = 1u<<i;
+	int sc = g->sizeclass;
+	uint32_t mask = g->freed_mask | g->avail_mask;
+
+	if (mask+self == (2u<<g->last_idx)-1 && okay_to_free(g)) {
+		// any multi-slot group is necessarily on an active list
+		// here, but single-slot groups might or might not be.
+		if (g->next) {
+			assert(sc < 48);
+			int activate_new = (ctx.active[sc]==g);
+			dequeue(&ctx.active[sc], g);
+			if (activate_new && ctx.active[sc])
+				activate_group(ctx.active[sc]);
+		}
+		return free_group(g);
+	} else if (!mask) {
+		assert(sc < 48);
+		// might still be active if there were no allocations
+		// after last available slot was taken.
+		if (ctx.active[sc] != g) {
+			queue(&ctx.active[sc], g);
+		}
+	}
+	a_or(&g->freed_mask, self);
+	return (struct mapinfo){ 0 };
+}
+
+void free(void *p)
+{
+	if (!p) return;
+
+	struct meta *g = get_meta(p);
+	int idx = get_slot_index(p);
+	size_t stride = get_stride(g);
+	unsigned char *start = g->mem->storage + stride*idx;
+	unsigned char *end = start + stride - IB;
+	get_nominal_size(p, end);
+	uint32_t self = 1u<<idx, all = (2u<<g->last_idx)-1;
+	((unsigned char *)p)[-3] = 255;
+	// invalidate offset to group header, and cycle offset of
+	// used region within slot if current offset is zero.
+	*(uint16_t *)((char *)p-2) = 0;
+
+	// release any whole pages contained in the slot to be freed
+	// unless it's a single-slot group that will be unmapped.
+	if (((uintptr_t)(start-1) ^ (uintptr_t)end) >= 2*PGSZ && g->last_idx) {
+		unsigned char *base = start + (-(uintptr_t)start & (PGSZ-1));
+		size_t len = (end-base) & -PGSZ;
+		if (len && USE_MADV_FREE) {
+			int e = errno;
+			madvise(base, len, MADV_FREE);
+			errno = e;
+		}
+	}
+
+	// atomic free without locking if this is neither first or last slot
+	for (;;) {
+		uint32_t freed = g->freed_mask;
+		uint32_t avail = g->avail_mask;
+		uint32_t mask = freed | avail;
+		assert(!(mask&self));
+		if (!freed || mask+self==all) break;
+		if (!MT)
+			g->freed_mask = freed+self;
+		else if (a_cas(&g->freed_mask, freed, freed+self)!=freed)
+			continue;
+		return;
+	}
+
+	wrlock();
+	struct mapinfo mi = nontrivial_free(g, idx);
+	unlock();
+	if (mi.len) {
+		int e = errno;
+		munmap(mi.base, mi.len);
+		errno = e;
+	}
+}
diff --git a/src/malloc/mallocng/glue.h b/src/malloc/mallocng/glue.h
new file mode 100644
index 00000000..77f4c812
--- /dev/null
+++ b/src/malloc/mallocng/glue.h
@@ -0,0 +1,95 @@
+#ifndef MALLOC_GLUE_H
+#define MALLOC_GLUE_H
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <elf.h>
+#include <string.h>
+#include "atomic.h"
+#include "syscall.h"
+#include "libc.h"
+#include "lock.h"
+#include "dynlink.h"
+
+// use macros to appropriately namespace these.
+#define size_classes __malloc_size_classes
+#define ctx __malloc_context
+#define alloc_meta __malloc_alloc_meta
+#define is_allzero __malloc_allzerop
+#define dump_heap __dump_heap
+
+#define malloc __libc_malloc_impl
+#define realloc __libc_realloc
+#define free __libc_free
+
+#define USE_MADV_FREE 0
+
+#if USE_REAL_ASSERT
+#include <assert.h>
+#else
+#undef assert
+#define assert(x) do { if (!(x)) a_crash(); } while(0)
+#endif
+
+#define brk(p) ((uintptr_t)__syscall(SYS_brk, p))
+
+#define mmap __mmap
+#define madvise __madvise
+#define mremap __mremap
+
+#define DISABLE_ALIGNED_ALLOC (__malloc_replaced && !__aligned_alloc_replaced)
+
+static inline uint64_t get_random_secret()
+{
+	uint64_t secret = (uintptr_t)&secret * 1103515245;
+	for (size_t i=0; libc.auxv[i]; i+=2)
+		if (libc.auxv[i]==AT_RANDOM)
+			memcpy(&secret, (char *)libc.auxv[i+1]+8, sizeof secret);
+	return secret;
+}
+
+#ifndef PAGESIZE
+#define PAGESIZE PAGE_SIZE
+#endif
+
+#define MT (libc.need_locks)
+
+#define RDLOCK_IS_EXCLUSIVE 1
+
+__attribute__((__visibility__("hidden")))
+extern int __malloc_lock[1];
+
+#define LOCK_OBJ_DEF \
+int __malloc_lock[1]; \
+void __malloc_atfork(int who) { malloc_atfork(who); }
+
+static inline void rdlock()
+{
+	if (MT) LOCK(__malloc_lock);
+}
+static inline void wrlock()
+{
+	if (MT) LOCK(__malloc_lock);
+}
+static inline void unlock()
+{
+	UNLOCK(__malloc_lock);
+}
+static inline void upgradelock()
+{
+}
+static inline void resetlock()
+{
+	__malloc_lock[0] = 0;
+}
+
+static inline void malloc_atfork(int who)
+{
+	if (who<0) rdlock();
+	else if (who>0) resetlock();
+	else unlock();
+}
+
+#endif
diff --git a/src/malloc/mallocng/malloc.c b/src/malloc/mallocng/malloc.c
new file mode 100644
index 00000000..d695ab8e
--- /dev/null
+++ b/src/malloc/mallocng/malloc.c
@@ -0,0 +1,387 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#include "meta.h"
+
+LOCK_OBJ_DEF;
+
+const uint16_t size_classes[] = {
+	1, 2, 3, 4, 5, 6, 7, 8,
+	9, 10, 12, 15,
+	18, 20, 25, 31,
+	36, 42, 50, 63,
+	72, 84, 102, 127,
+	146, 170, 204, 255,
+	292, 340, 409, 511,
+	584, 682, 818, 1023,
+	1169, 1364, 1637, 2047,
+	2340, 2730, 3276, 4095,
+	4680, 5460, 6552, 8191,
+};
+
+static const uint8_t small_cnt_tab[][3] = {
+	{ 30, 30, 30 },
+	{ 31, 15, 15 },
+	{ 20, 10, 10 },
+	{ 31, 15, 7 },
+	{ 25, 12, 6 },
+	{ 21, 10, 5 },
+	{ 18, 8, 4 },
+	{ 31, 15, 7 },
+	{ 28, 14, 6 },
+};
+
+static const uint8_t med_cnt_tab[4] = { 28, 24, 20, 32 };
+
+struct malloc_context ctx = { 0 };
+
+struct meta *alloc_meta(void)
+{
+	struct meta *m;
+	unsigned char *p;
+	if (!ctx.init_done) {
+#ifndef PAGESIZE
+		ctx.pagesize = get_page_size();
+#endif
+		ctx.secret = get_random_secret();
+		ctx.init_done = 1;
+	}
+	size_t pagesize = PGSZ;
+	if (pagesize < 4096) pagesize = 4096;
+	if ((m = dequeue_head(&ctx.free_meta_head))) return m;
+	if (!ctx.avail_meta_count) {
+		int need_unprotect = 1;
+		if (!ctx.avail_meta_area_count && ctx.brk!=-1) {
+			uintptr_t new = ctx.brk + pagesize;
+			int need_guard = 0;
+			if (!ctx.brk) {
+				need_guard = 1;
+				ctx.brk = brk(0);
+				// some ancient kernels returned _ebss
+				// instead of next page as initial brk.
+				ctx.brk += -ctx.brk & (pagesize-1);
+				new = ctx.brk + 2*pagesize;
+			}
+			if (brk(new) != new) {
+				ctx.brk = -1;
+			} else {
+				if (need_guard) mmap((void *)ctx.brk, pagesize,
+					PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0);
+				ctx.brk = new;
+				ctx.avail_meta_areas = (void *)(new - pagesize);
+				ctx.avail_meta_area_count = pagesize>>12;
+				need_unprotect = 0;
+			}
+		}
+		if (!ctx.avail_meta_area_count) {
+			size_t n = 2UL << ctx.meta_alloc_shift;
+			p = mmap(0, n*pagesize, PROT_NONE,
+				MAP_PRIVATE|MAP_ANON, -1, 0);
+			if (p==MAP_FAILED) return 0;
+			ctx.avail_meta_areas = p + pagesize;
+			ctx.avail_meta_area_count = (n-1)*(pagesize>>12);
+			ctx.meta_alloc_shift++;
+		}
+		p = ctx.avail_meta_areas;
+		if ((uintptr_t)p & (pagesize-1)) need_unprotect = 0;
+		if (need_unprotect)
+			if (mprotect(p, pagesize, PROT_READ|PROT_WRITE)
+			    && errno != ENOSYS)
+				return 0;
+		ctx.avail_meta_area_count--;
+		ctx.avail_meta_areas = p + 4096;
+		if (ctx.meta_area_tail) {
+			ctx.meta_area_tail->next = (void *)p;
+		} else {
+			ctx.meta_area_head = (void *)p;
+		}
+		ctx.meta_area_tail = (void *)p;
+		ctx.meta_area_tail->check = ctx.secret;
+		ctx.avail_meta_count = ctx.meta_area_tail->nslots
+			= (4096-sizeof(struct meta_area))/sizeof *m;
+		ctx.avail_meta = ctx.meta_area_tail->slots;
+	}
+	ctx.avail_meta_count--;
+	m = ctx.avail_meta++;
+	m->prev = m->next = 0;
+	return m;
+}
+
+static uint32_t try_avail(struct meta **pm)
+{
+	struct meta *m = *pm;
+	uint32_t first;
+	if (!m) return 0;
+	uint32_t mask = m->avail_mask;
+	if (!mask) {
+		if (!m) return 0;
+		if (!m->freed_mask) {
+			dequeue(pm, m);
+			m = *pm;
+			if (!m) return 0;
+		} else {
+			m = m->next;
+			*pm = m;
+		}
+
+		mask = m->freed_mask;
+
+		// skip fully-free group unless it's the only one
+		// or it's a permanently non-freeable group
+		if (mask == (2u<<m->last_idx)-1 && m->freeable) {
+			m = m->next;
+			*pm = m;
+			mask = m->freed_mask;
+		}
+
+		// activate more slots in a not-fully-active group
+		// if needed, but only as a last resort. prefer using
+		// any other group with free slots. this avoids
+		// touching & dirtying as-yet-unused pages.
+		if (!(mask & ((2u<<m->mem->active_idx)-1))) {
+			if (m->next != m) {
+				m = m->next;
+				*pm = m;
+			} else {
+				int cnt = m->mem->active_idx + 2;
+				int size = size_classes[m->sizeclass]*UNIT;
+				int span = UNIT + size*cnt;
+				// activate up to next 4k boundary
+				while ((span^(span+size-1)) < 4096) {
+					cnt++;
+					span += size;
+				}
+				if (cnt > m->last_idx+1)
+					cnt = m->last_idx+1;
+				m->mem->active_idx = cnt-1;
+			}
+		}
+		mask = activate_group(m);
+		assert(mask);
+		decay_bounces(m->sizeclass);
+	}
+	first = mask&-mask;
+	m->avail_mask = mask-first;
+	return first;
+}
+
+static int alloc_slot(int, size_t);
+
+static struct meta *alloc_group(int sc, size_t req)
+{
+	size_t size = UNIT*size_classes[sc];
+	int i = 0, cnt;
+	unsigned char *p;
+	struct meta *m = alloc_meta();
+	if (!m) return 0;
+	size_t usage = ctx.usage_by_class[sc];
+	size_t pagesize = PGSZ;
+	int active_idx;
+	if (sc < 9) {
+		while (i<2 && 4*small_cnt_tab[sc][i] > usage)
+			i++;
+		cnt = small_cnt_tab[sc][i];
+	} else {
+		// lookup max number of slots fitting in power-of-two size
+		// from a table, along with number of factors of two we
+		// can divide out without a remainder or reaching 1.
+		cnt = med_cnt_tab[sc&3];
+
+		// reduce cnt to avoid excessive eagar allocation.
+		while (!(cnt&1) && 4*cnt > usage)
+			cnt >>= 1;
+
+		// data structures don't support groups whose slot offsets
+		// in units don't fit in 16 bits.
+		while (size*cnt >= 65536*UNIT)
+			cnt >>= 1;
+	}
+
+	// If we selected a count of 1 above but it's not sufficient to use
+	// mmap, increase to 2. Then it might be; if not it will nest.
+	if (cnt==1 && size*cnt+UNIT <= pagesize/2) cnt = 2;
+
+	// All choices of size*cnt are "just below" a power of two, so anything
+	// larger than half the page size should be allocated as whole pages.
+	if (size*cnt+UNIT > pagesize/2) {
+		// check/update bounce counter to start/increase retention
+		// of freed maps, and inhibit use of low-count, odd-size
+		// small mappings and single-slot groups if activated.
+		int nosmall = is_bouncing(sc);
+		account_bounce(sc);
+		step_seq();
+
+		// since the following count reduction opportunities have
+		// an absolute memory usage cost, don't overdo them. count
+		// coarse usage as part of usage.
+		if (!(sc&1) && sc<32) usage += ctx.usage_by_class[sc+1];
+
+		// try to drop to a lower count if the one found above
+		// increases usage by more than 25%. these reduced counts
+		// roughly fill an integral number of pages, just not a
+		// power of two, limiting amount of unusable space.
+		if (4*cnt > usage && !nosmall) {
+			if (0);
+			else if ((sc&3)==1 && size*cnt>8*pagesize) cnt = 2;
+			else if ((sc&3)==2 && size*cnt>4*pagesize) cnt = 3;
+			else if ((sc&3)==0 && size*cnt>8*pagesize) cnt = 3;
+			else if ((sc&3)==0 && size*cnt>2*pagesize) cnt = 5;
+		}
+		size_t needed = size*cnt + UNIT;
+		needed += -needed & (pagesize-1);
+
+		// produce an individually-mmapped allocation if usage is low,
+		// bounce counter hasn't triggered, and either it saves memory
+		// or it avoids eagar slot allocation without wasting too much.
+		if (!nosmall && cnt<=7) {
+			req += IB + UNIT;
+			req += -req & (pagesize-1);
+			if (req<size+UNIT || (req>=4*pagesize && 2*cnt>usage)) {
+				cnt = 1;
+				needed = req;
+			}
+		}
+
+		p = mmap(0, needed, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
+		if (p==MAP_FAILED) {
+			free_meta(m);
+			return 0;
+		}
+		m->maplen = needed>>12;
+		ctx.mmap_counter++;
+		active_idx = (4096-UNIT)/size-1;
+		if (active_idx > cnt-1) active_idx = cnt-1;
+		if (active_idx < 0) active_idx = 0;
+	} else {
+		int j = size_to_class(UNIT+cnt*size-IB);
+		int idx = alloc_slot(j, UNIT+cnt*size-IB);
+		if (idx < 0) {
+			free_meta(m);
+			return 0;
+		}
+		struct meta *g = ctx.active[j];
+		p = enframe(g, idx, UNIT*size_classes[j]-IB, ctx.mmap_counter);
+		m->maplen = 0;
+		p[-3] = (p[-3]&31) | (6<<5);
+		for (int i=0; i<=cnt; i++)
+			p[UNIT+i*size-4] = 0;
+		active_idx = cnt-1;
+	}
+	ctx.usage_by_class[sc] += cnt;
+	m->avail_mask = (2u<<active_idx)-1;
+	m->freed_mask = (2u<<(cnt-1))-1 - m->avail_mask;
+	m->mem = (void *)p;
+	m->mem->meta = m;
+	m->mem->active_idx = active_idx;
+	m->last_idx = cnt-1;
+	m->freeable = 1;
+	m->sizeclass = sc;
+	return m;
+}
+
+static int alloc_slot(int sc, size_t req)
+{
+	uint32_t first = try_avail(&ctx.active[sc]);
+	if (first) return a_ctz_32(first);
+
+	struct meta *g = alloc_group(sc, req);
+	if (!g) return -1;
+
+	g->avail_mask--;
+	queue(&ctx.active[sc], g);
+	return 0;
+}
+
+void *malloc(size_t n)
+{
+	if (size_overflows(n)) return 0;
+	struct meta *g;
+	uint32_t mask, first;
+	int sc;
+	int idx;
+	int ctr;
+
+	if (n >= MMAP_THRESHOLD) {
+		size_t needed = n + IB + UNIT;
+		void *p = mmap(0, needed, PROT_READ|PROT_WRITE,
+			MAP_PRIVATE|MAP_ANON, -1, 0);
+		if (p==MAP_FAILED) return 0;
+		wrlock();
+		step_seq();
+		g = alloc_meta();
+		if (!g) {
+			unlock();
+			munmap(p, needed);
+			return 0;
+		}
+		g->mem = p;
+		g->mem->meta = g;
+		g->last_idx = 0;
+		g->freeable = 1;
+		g->sizeclass = 63;
+		g->maplen = (needed+4095)/4096;
+		g->avail_mask = g->freed_mask = 0;
+		// use a global counter to cycle offset in
+		// individually-mmapped allocations.
+		ctx.mmap_counter++;
+		idx = 0;
+		goto success;
+	}
+
+	sc = size_to_class(n);
+
+	rdlock();
+	g = ctx.active[sc];
+
+	// use coarse size classes initially when there are not yet
+	// any groups of desired size. this allows counts of 2 or 3
+	// to be allocated at first rather than having to start with
+	// 7 or 5, the min counts for even size classes.
+	if (!g && sc>=4 && sc<32 && sc!=6 && !(sc&1) && !ctx.usage_by_class[sc]) {
+		size_t usage = ctx.usage_by_class[sc|1];
+		// if a new group may be allocated, count it toward
+		// usage in deciding if we can use coarse class.
+		if (!ctx.active[sc|1] || (!ctx.active[sc|1]->avail_mask
+		    && !ctx.active[sc|1]->freed_mask))
+			usage += 3;
+		if (usage <= 12)
+			sc |= 1;
+		g = ctx.active[sc];
+	}
+
+	for (;;) {
+		mask = g ? g->avail_mask : 0;
+		first = mask&-mask;
+		if (!first) break;
+		if (RDLOCK_IS_EXCLUSIVE || !MT)
+			g->avail_mask = mask-first;
+		else if (a_cas(&g->avail_mask, mask, mask-first)!=mask)
+			continue;
+		idx = a_ctz_32(first);
+		goto success;
+	}
+	upgradelock();
+
+	idx = alloc_slot(sc, n);
+	if (idx < 0) {
+		unlock();
+		return 0;
+	}
+	g = ctx.active[sc];
+
+success:
+	ctr = ctx.mmap_counter;
+	unlock();
+	return enframe(g, idx, n, ctr);
+}
+
+int is_allzero(void *p)
+{
+	struct meta *g = get_meta(p);
+	return g->sizeclass >= 48 ||
+		get_stride(g) < UNIT*size_classes[g->sizeclass];
+}
diff --git a/src/malloc/mallocng/malloc_usable_size.c b/src/malloc/mallocng/malloc_usable_size.c
new file mode 100644
index 00000000..ce6a960c
--- /dev/null
+++ b/src/malloc/mallocng/malloc_usable_size.c
@@ -0,0 +1,13 @@
+#include <stdlib.h>
+#include "meta.h"
+
+size_t malloc_usable_size(void *p)
+{
+	if (!p) return 0;
+	struct meta *g = get_meta(p);
+	int idx = get_slot_index(p);
+	size_t stride = get_stride(g);
+	unsigned char *start = g->mem->storage + stride*idx;
+	unsigned char *end = start + stride - IB;
+	return get_nominal_size(p, end);
+}
diff --git a/src/malloc/mallocng/meta.h b/src/malloc/mallocng/meta.h
new file mode 100644
index 00000000..61ec53f9
--- /dev/null
+++ b/src/malloc/mallocng/meta.h
@@ -0,0 +1,288 @@
+#ifndef MALLOC_META_H
+#define MALLOC_META_H
+
+#include <stdint.h>
+#include <errno.h>
+#include <limits.h>
+#include "glue.h"
+
+__attribute__((__visibility__("hidden")))
+extern const uint16_t size_classes[];
+
+#define MMAP_THRESHOLD 131052
+
+#define UNIT 16
+#define IB 4
+
+struct group {
+	struct meta *meta;
+	unsigned char active_idx:5;
+	char pad[UNIT - sizeof(struct meta *) - 1];
+	unsigned char storage[];
+};
+
+struct meta {
+	struct meta *prev, *next;
+	struct group *mem;
+	volatile int avail_mask, freed_mask;
+	uintptr_t last_idx:5;
+	uintptr_t freeable:1;
+	uintptr_t sizeclass:6;
+	uintptr_t maplen:8*sizeof(uintptr_t)-12;
+};
+
+struct meta_area {
+	uint64_t check;
+	struct meta_area *next;
+	int nslots;
+	struct meta slots[];
+};
+
+struct malloc_context {
+	uint64_t secret;
+#ifndef PAGESIZE
+	size_t pagesize;
+#endif
+	int init_done;
+	unsigned mmap_counter;
+	struct meta *free_meta_head;
+	struct meta *avail_meta;
+	size_t avail_meta_count, avail_meta_area_count, meta_alloc_shift;
+	struct meta_area *meta_area_head, *meta_area_tail;
+	unsigned char *avail_meta_areas;
+	struct meta *active[48];
+	size_t usage_by_class[48];
+	uint8_t unmap_seq[32], bounces[32];
+	uint8_t seq;
+	uintptr_t brk;
+};
+
+__attribute__((__visibility__("hidden")))
+extern struct malloc_context ctx;
+
+#ifdef PAGESIZE
+#define PGSZ PAGESIZE
+#else
+#define PGSZ ctx.pagesize
+#endif
+
+__attribute__((__visibility__("hidden")))
+struct meta *alloc_meta(void);
+
+__attribute__((__visibility__("hidden")))
+int is_allzero(void *);
+
+static inline void queue(struct meta **phead, struct meta *m)
+{
+	assert(!m->next);
+	assert(!m->prev);
+	if (*phead) {
+		struct meta *head = *phead;
+		m->next = head;
+		m->prev = head->prev;
+		m->next->prev = m->prev->next = m;
+	} else {
+		m->prev = m->next = m;
+		*phead = m;
+	}
+}
+
+static inline void dequeue(struct meta **phead, struct meta *m)
+{
+	if (m->next != m) {
+		m->prev->next = m->next;
+		m->next->prev = m->prev;
+		if (*phead == m) *phead = m->next;
+	} else {
+		*phead = 0;
+	}
+	m->prev = m->next = 0;
+}
+
+static inline struct meta *dequeue_head(struct meta **phead)
+{
+	struct meta *m = *phead;
+	if (m) dequeue(phead, m);
+	return m;
+}
+
+static inline void free_meta(struct meta *m)
+{
+	*m = (struct meta){0};
+	queue(&ctx.free_meta_head, m);
+}
+
+static inline uint32_t activate_group(struct meta *m)
+{
+	assert(!m->avail_mask);
+	uint32_t mask, act = (2u<<m->mem->active_idx)-1;
+	do mask = m->freed_mask;
+	while (a_cas(&m->freed_mask, mask, mask&~act)!=mask);
+	return m->avail_mask = mask & act;
+}
+
+static inline int get_slot_index(const unsigned char *p)
+{
+	return p[-3] & 31;
+}
+
+static inline struct meta *get_meta(const unsigned char *p)
+{
+	assert(!((uintptr_t)p & 15));
+	int offset = *(const uint16_t *)(p - 2);
+	int index = get_slot_index(p);
+	if (p[-4]) {
+		assert(!offset);
+		offset = *(uint32_t *)(p - 8);
+		assert(offset > 0xffff);
+	}
+	const struct group *base = (const void *)(p - UNIT*offset - UNIT);
+	const struct meta *meta = base->meta;
+	assert(meta->mem == base);
+	assert(index <= meta->last_idx);
+	assert(!(meta->avail_mask & (1u<<index)));
+	assert(!(meta->freed_mask & (1u<<index)));
+	const struct meta_area *area = (void *)((uintptr_t)meta & -4096);
+	assert(area->check == ctx.secret);
+	if (meta->sizeclass < 48) {
+		assert(offset >= size_classes[meta->sizeclass]*index);
+		assert(offset < size_classes[meta->sizeclass]*(index+1));
+	} else {
+		assert(meta->sizeclass == 63);
+	}
+	if (meta->maplen) {
+		assert(offset <= meta->maplen*4096UL/UNIT - 1);
+	}
+	return (struct meta *)meta;
+}
+
+static inline size_t get_nominal_size(const unsigned char *p, const unsigned char *end)
+{
+	size_t reserved = p[-3] >> 5;
+	if (reserved >= 5) {
+		assert(reserved == 5);
+		reserved = *(const uint32_t *)(end-4);
+		assert(reserved >= 5);
+		assert(!end[-5]);
+	}
+	assert(reserved <= end-p);
+	assert(!*(end-reserved));
+	// also check the slot's overflow byte
+	assert(!*end);
+	return end-reserved-p;
+}
+
+static inline size_t get_stride(const struct meta *g)
+{
+	if (!g->last_idx && g->maplen) {
+		return g->maplen*4096UL - UNIT;
+	} else {
+		return UNIT*size_classes[g->sizeclass];
+	}
+}
+
+static inline void set_size(unsigned char *p, unsigned char *end, size_t n)
+{
+	int reserved = end-p-n;
+	if (reserved) end[-reserved] = 0;
+	if (reserved >= 5) {
+		*(uint32_t *)(end-4) = reserved;
+		end[-5] = 0;
+		reserved = 5;
+	}
+	p[-3] = (p[-3]&31) + (reserved<<5);
+}
+
+static inline void *enframe(struct meta *g, int idx, size_t n, int ctr)
+{
+	size_t stride = get_stride(g);
+	size_t slack = (stride-IB-n)/UNIT;
+	unsigned char *p = g->mem->storage + stride*idx;
+	unsigned char *end = p+stride-IB;
+	// cycle offset within slot to increase interval to address
+	// reuse, facilitate trapping double-free.
+	int off = (p[-3] ? *(uint16_t *)(p-2) + 1 : ctr) & 255;
+	assert(!p[-4]);
+	if (off > slack) {
+		size_t m = slack;
+		m |= m>>1; m |= m>>2; m |= m>>4;
+		off &= m;
+		if (off > slack) off -= slack+1;
+		assert(off <= slack);
+	}
+	if (off) {
+		// store offset in unused header at offset zero
+		// if enframing at non-zero offset.
+		*(uint16_t *)(p-2) = off;
+		p[-3] = 7<<5;
+		p += UNIT*off;
+		// for nonzero offset there is no permanent check
+		// byte, so make one.
+		p[-4] = 0;
+	}
+	*(uint16_t *)(p-2) = (size_t)(p-g->mem->storage)/UNIT;
+	p[-3] = idx;
+	set_size(p, end, n);
+	return p;
+}
+
+static inline int size_to_class(size_t n)
+{
+	n = (n+IB-1)>>4;
+	if (n<10) return n;
+	n++;
+	int i = (28-a_clz_32(n))*4 + 8;
+	if (n>size_classes[i+1]) i+=2;
+	if (n>size_classes[i]) i++;
+	return i;
+}
+
+static inline int size_overflows(size_t n)
+{
+	if (n >= SIZE_MAX/2 - 4096) {
+		errno = ENOMEM;
+		return 1;
+	}
+	return 0;
+}
+
+static inline void step_seq(void)
+{
+	if (ctx.seq==255) {
+		for (int i=0; i<32; i++) ctx.unmap_seq[i] = 0;
+		ctx.seq = 1;
+	} else {
+		ctx.seq++;
+	}
+}
+
+static inline void record_seq(int sc)
+{
+	if (sc-7U < 32) ctx.unmap_seq[sc-7] = ctx.seq;
+}
+
+static inline void account_bounce(int sc)
+{
+	if (sc-7U < 32) {
+		int seq = ctx.unmap_seq[sc-7];
+		if (seq && ctx.seq-seq < 10) {
+			if (ctx.bounces[sc-7]+1 < 100)
+				ctx.bounces[sc-7]++;
+			else
+				ctx.bounces[sc-7] = 150;
+		}
+	}
+}
+
+static inline void decay_bounces(int sc)
+{
+	if (sc-7U < 32 && ctx.bounces[sc-7])
+		ctx.bounces[sc-7]--;
+}
+
+static inline int is_bouncing(int sc)
+{
+	return (sc-7U < 32 && ctx.bounces[sc-7] >= 100);
+}
+
+#endif
diff --git a/src/malloc/mallocng/realloc.c b/src/malloc/mallocng/realloc.c
new file mode 100644
index 00000000..18769f42
--- /dev/null
+++ b/src/malloc/mallocng/realloc.c
@@ -0,0 +1,51 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <string.h>
+#include "meta.h"
+
+void *realloc(void *p, size_t n)
+{
+	if (!p) return malloc(n);
+	if (size_overflows(n)) return 0;
+
+	struct meta *g = get_meta(p);
+	int idx = get_slot_index(p);
+	size_t stride = get_stride(g);
+	unsigned char *start = g->mem->storage + stride*idx;
+	unsigned char *end = start + stride - IB;
+	size_t old_size = get_nominal_size(p, end);
+	size_t avail_size = end-(unsigned char *)p;
+	void *new;
+
+	// only resize in-place if size class matches
+	if (n <= avail_size && n<MMAP_THRESHOLD
+	    && size_to_class(n)+1 >= g->sizeclass) {
+		set_size(p, end, n);
+		return p;
+	}
+
+	// use mremap if old and new size are both mmap-worthy
+	if (g->sizeclass>=48 && n>=MMAP_THRESHOLD) {
+		assert(g->sizeclass==63);
+		size_t base = (unsigned char *)p-start;
+		size_t needed = (n + base + UNIT + IB + 4095) & -4096;
+		new = g->maplen*4096UL == needed ? g->mem :
+			mremap(g->mem, g->maplen*4096UL, needed, MREMAP_MAYMOVE);
+		if (new!=MAP_FAILED) {
+			g->mem = new;
+			g->maplen = needed/4096;
+			p = g->mem->storage + base;
+			end = g->mem->storage + (needed - UNIT) - IB;
+			*end = 0;
+			set_size(p, end, n);
+			return p;
+		}
+	}
+
+	new = malloc(n);
+	if (!new) return 0;
+	memcpy(new, p, n < old_size ? n : old_size);
+	free(p);
+	return new;
+}
diff --git a/src/malloc/memalign.c b/src/malloc/memalign.c
index cf9dfbda..32cd87d8 100644
--- a/src/malloc/memalign.c
+++ b/src/malloc/memalign.c
@@ -1,54 +1,7 @@
+#define _BSD_SOURCE
 #include <stdlib.h>
-#include <stdint.h>
-#include <errno.h>
-#include "malloc_impl.h"
 
-void *__memalign(size_t align, size_t len)
+void *memalign(size_t align, size_t len)
 {
-	unsigned char *mem, *new;
-
-	if ((align & -align) != align) {
-		errno = EINVAL;
-		return 0;
-	}
-
-	if (len > SIZE_MAX - align || __malloc_replaced) {
-		errno = ENOMEM;
-		return 0;
-	}
-
-	if (align <= SIZE_ALIGN)
-		return malloc(len);
-
-	if (!(mem = malloc(len + align-1)))
-		return 0;
-
-	new = (void *)((uintptr_t)mem + align-1 & -align);
-	if (new == mem) return mem;
-
-	struct chunk *c = MEM_TO_CHUNK(mem);
-	struct chunk *n = MEM_TO_CHUNK(new);
-
-	if (IS_MMAPPED(c)) {
-		/* Apply difference between aligned and original
-		 * address to the "extra" field of mmapped chunk. */
-		n->psize = c->psize + (new-mem);
-		n->csize = c->csize - (new-mem);
-		return new;
-	}
-
-	struct chunk *t = NEXT_CHUNK(c);
-
-	/* Split the allocated chunk into two chunks. The aligned part
-	 * that will be used has the size in its footer reduced by the
-	 * difference between the aligned and original addresses, and
-	 * the resulting size copied to its header. A new header and
-	 * footer are written for the split-off part to be freed. */
-	n->psize = c->csize = C_INUSE | (new-mem);
-	n->csize = t->psize -= new-mem;
-
-	__bin_chunk(c);
-	return new;
+	return aligned_alloc(align, len);
 }
-
-weak_alias(__memalign, memalign);
diff --git a/src/malloc/oldmalloc/aligned_alloc.c b/src/malloc/oldmalloc/aligned_alloc.c
new file mode 100644
index 00000000..4adca3b4
--- /dev/null
+++ b/src/malloc/oldmalloc/aligned_alloc.c
@@ -0,0 +1,53 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include "malloc_impl.h"
+
+void *aligned_alloc(size_t align, size_t len)
+{
+	unsigned char *mem, *new;
+
+	if ((align & -align) != align) {
+		errno = EINVAL;
+		return 0;
+	}
+
+	if (len > SIZE_MAX - align ||
+	    (__malloc_replaced && !__aligned_alloc_replaced)) {
+		errno = ENOMEM;
+		return 0;
+	}
+
+	if (align <= SIZE_ALIGN)
+		return malloc(len);
+
+	if (!(mem = malloc(len + align-1)))
+		return 0;
+
+	new = (void *)((uintptr_t)mem + align-1 & -align);
+	if (new == mem) return mem;
+
+	struct chunk *c = MEM_TO_CHUNK(mem);
+	struct chunk *n = MEM_TO_CHUNK(new);
+
+	if (IS_MMAPPED(c)) {
+		/* Apply difference between aligned and original
+		 * address to the "extra" field of mmapped chunk. */
+		n->psize = c->psize + (new-mem);
+		n->csize = c->csize - (new-mem);
+		return new;
+	}
+
+	struct chunk *t = NEXT_CHUNK(c);
+
+	/* Split the allocated chunk into two chunks. The aligned part
+	 * that will be used has the size in its footer reduced by the
+	 * difference between the aligned and original addresses, and
+	 * the resulting size copied to its header. A new header and
+	 * footer are written for the split-off part to be freed. */
+	n->psize = c->csize = C_INUSE | (new-mem);
+	n->csize = t->psize -= new-mem;
+
+	__bin_chunk(c);
+	return new;
+}
diff --git a/src/malloc/malloc.c b/src/malloc/oldmalloc/malloc.c
index 96982596..25d00d44 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/oldmalloc/malloc.c
@@ -9,6 +9,11 @@
 #include "atomic.h"
 #include "pthread_impl.h"
 #include "malloc_impl.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc_impl
+#define realloc __libc_realloc
+#define free __libc_free
 
 #if defined(__GNUC__) && defined(__PIC__)
 #define inline inline __attribute__((always_inline))
@@ -17,17 +22,18 @@
 static struct {
 	volatile uint64_t binmap;
 	struct bin bins[64];
-	volatile int free_lock[2];
+	volatile int split_merge_lock[2];
 } mal;
 
-int __malloc_replaced;
-
 /* Synchronization tools */
 
 static inline void lock(volatile int *lk)
 {
-	if (libc.threads_minus_1)
+	int need_locks = libc.need_locks;
+	if (need_locks) {
 		while(a_swap(lk, 1)) __wait(lk, lk+1, 1, 1);
+		if (need_locks < 0) libc.need_locks = 0;
+	}
 }
 
 static inline void unlock(volatile int *lk)
@@ -123,9 +129,72 @@ void __dump_heap(int x)
 }
 #endif
 
+/* This function returns true if the interval [old,new]
+ * intersects the 'len'-sized interval below &libc.auxv
+ * (interpreted as the main-thread stack) or below &b
+ * (the current stack). It is used to defend against
+ * buggy brk implementations that can cross the stack. */
+
+static int traverses_stack_p(uintptr_t old, uintptr_t new)
+{
+	const uintptr_t len = 8<<20;
+	uintptr_t a, b;
+
+	b = (uintptr_t)libc.auxv;
+	a = b > len ? b-len : 0;
+	if (new>a && old<b) return 1;
+
+	b = (uintptr_t)&b;
+	a = b > len ? b-len : 0;
+	if (new>a && old<b) return 1;
+
+	return 0;
+}
+
+/* Expand the heap in-place if brk can be used, or otherwise via mmap,
+ * using an exponential lower bound on growth by mmap to make
+ * fragmentation asymptotically irrelevant. The size argument is both
+ * an input and an output, since the caller needs to know the size
+ * allocated, which will be larger than requested due to page alignment
+ * and mmap minimum size rules. The caller is responsible for locking
+ * to prevent concurrent calls. */
+
+static void *__expand_heap(size_t *pn)
+{
+	static uintptr_t brk;
+	static unsigned mmap_step;
+	size_t n = *pn;
+
+	if (n > SIZE_MAX/2 - PAGE_SIZE) {
+		errno = ENOMEM;
+		return 0;
+	}
+	n += -n & PAGE_SIZE-1;
+
+	if (!brk) {
+		brk = __syscall(SYS_brk, 0);
+		brk += -brk & PAGE_SIZE-1;
+	}
+
+	if (n < SIZE_MAX-brk && !traverses_stack_p(brk, brk+n)
+	    && __syscall(SYS_brk, brk+n)==brk+n) {
+		*pn = n;
+		brk += n;
+		return (void *)(brk-n);
+	}
+
+	size_t min = (size_t)PAGE_SIZE << mmap_step/2;
+	if (n < min) n = min;
+	void *area = __mmap(0, n, PROT_READ|PROT_WRITE,
+		MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (area == MAP_FAILED) return 0;
+	*pn = n;
+	mmap_step++;
+	return area;
+}
+
 static struct chunk *expand_heap(size_t n)
 {
-	static int heap_lock[2];
 	static void *end;
 	void *p;
 	struct chunk *w;
@@ -135,13 +204,8 @@ static struct chunk *expand_heap(size_t n)
 	 * we need room for an extra zero-sized sentinel chunk. */
 	n += SIZE_ALIGN;
 
-	lock(heap_lock);
-
 	p = __expand_heap(&n);
-	if (!p) {
-		unlock(heap_lock);
-		return 0;
-	}
+	if (!p) return 0;
 
 	/* If not just expanding existing space, we need to make a
 	 * new sentinel chunk below the allocated space. */
@@ -164,8 +228,6 @@ static struct chunk *expand_heap(size_t n)
 	w = MEM_TO_CHUNK(p);
 	w->csize = n | C_INUSE;
 
-	unlock(heap_lock);
-
 	return w;
 }
 
@@ -195,96 +257,44 @@ static void unbin(struct chunk *c, int i)
 	NEXT_CHUNK(c)->psize |= C_INUSE;
 }
 
-static int alloc_fwd(struct chunk *c)
+static void bin_chunk(struct chunk *self, int i)
 {
-	int i;
-	size_t k;
-	while (!((k=c->csize) & C_INUSE)) {
-		i = bin_index(k);
-		lock_bin(i);
-		if (c->csize == k) {
-			unbin(c, i);
-			unlock_bin(i);
-			return 1;
-		}
-		unlock_bin(i);
-	}
-	return 0;
-}
-
-static int alloc_rev(struct chunk *c)
-{
-	int i;
-	size_t k;
-	while (!((k=c->psize) & C_INUSE)) {
-		i = bin_index(k);
-		lock_bin(i);
-		if (c->psize == k) {
-			unbin(PREV_CHUNK(c), i);
-			unlock_bin(i);
-			return 1;
-		}
-		unlock_bin(i);
-	}
-	return 0;
+	self->next = BIN_TO_CHUNK(i);
+	self->prev = mal.bins[i].tail;
+	self->next->prev = self;
+	self->prev->next = self;
+	if (self->prev == BIN_TO_CHUNK(i))
+		a_or_64(&mal.binmap, 1ULL<<i);
 }
 
-
-/* pretrim - trims a chunk _prior_ to removing it from its bin.
- * Must be called with i as the ideal bin for size n, j the bin
- * for the _free_ chunk self, and bin j locked. */
-static int pretrim(struct chunk *self, size_t n, int i, int j)
+static void trim(struct chunk *self, size_t n)
 {
-	size_t n1;
+	size_t n1 = CHUNK_SIZE(self);
 	struct chunk *next, *split;
 
-	/* We cannot pretrim if it would require re-binning. */
-	if (j < 40) return 0;
-	if (j < i+3) {
-		if (j != 63) return 0;
-		n1 = CHUNK_SIZE(self);
-		if (n1-n <= MMAP_THRESHOLD) return 0;
-	} else {
-		n1 = CHUNK_SIZE(self);
-	}
-	if (bin_index(n1-n) != j) return 0;
+	if (n >= n1 - DONTCARE) return;
 
 	next = NEXT_CHUNK(self);
 	split = (void *)((char *)self + n);
 
-	split->prev = self->prev;
-	split->next = self->next;
-	split->prev->next = split;
-	split->next->prev = split;
 	split->psize = n | C_INUSE;
 	split->csize = n1-n;
 	next->psize = n1-n;
 	self->csize = n | C_INUSE;
-	return 1;
-}
 
-static void trim(struct chunk *self, size_t n)
-{
-	size_t n1 = CHUNK_SIZE(self);
-	struct chunk *next, *split;
+	int i = bin_index(n1-n);
+	lock_bin(i);
 
-	if (n >= n1 - DONTCARE) return;
+	bin_chunk(split, i);
 
-	next = NEXT_CHUNK(self);
-	split = (void *)((char *)self + n);
-
-	split->psize = n | C_INUSE;
-	split->csize = n1-n | C_INUSE;
-	next->psize = n1-n | C_INUSE;
-	self->csize = n | C_INUSE;
-
-	__bin_chunk(split);
+	unlock_bin(i);
 }
 
 void *malloc(size_t n)
 {
 	struct chunk *c;
 	int i, j;
+	uint64_t mask;
 
 	if (adjust_size(&n) < 0) return 0;
 
@@ -300,70 +310,43 @@ void *malloc(size_t n)
 	}
 
 	i = bin_index_up(n);
-	for (;;) {
-		uint64_t mask = mal.binmap & -(1ULL<<i);
-		if (!mask) {
-			c = expand_heap(n);
-			if (!c) return 0;
-			if (alloc_rev(c)) {
-				struct chunk *x = c;
-				c = PREV_CHUNK(c);
-				NEXT_CHUNK(x)->psize = c->csize =
-					x->csize + CHUNK_SIZE(c);
-			}
-			break;
+	if (i<63 && (mal.binmap & (1ULL<<i))) {
+		lock_bin(i);
+		c = mal.bins[i].head;
+		if (c != BIN_TO_CHUNK(i) && CHUNK_SIZE(c)-n <= DONTCARE) {
+			unbin(c, i);
+			unlock_bin(i);
+			return CHUNK_TO_MEM(c);
 		}
+		unlock_bin(i);
+	}
+	lock(mal.split_merge_lock);
+	for (mask = mal.binmap & -(1ULL<<i); mask; mask -= (mask&-mask)) {
 		j = first_set(mask);
 		lock_bin(j);
 		c = mal.bins[j].head;
 		if (c != BIN_TO_CHUNK(j)) {
-			if (!pretrim(c, n, i, j)) unbin(c, j);
+			unbin(c, j);
 			unlock_bin(j);
 			break;
 		}
 		unlock_bin(j);
 	}
-
-	/* Now patch up in case we over-allocated */
+	if (!mask) {
+		c = expand_heap(n);
+		if (!c) {
+			unlock(mal.split_merge_lock);
+			return 0;
+		}
+	}
 	trim(c, n);
-
+	unlock(mal.split_merge_lock);
 	return CHUNK_TO_MEM(c);
 }
 
-static size_t mal0_clear(char *p, size_t pagesz, size_t n)
+int __malloc_allzerop(void *p)
 {
-#ifdef __GNUC__
-	typedef uint64_t __attribute__((__may_alias__)) T;
-#else
-	typedef unsigned char T;
-#endif
-	char *pp = p + n;
-	size_t i = (uintptr_t)pp & (pagesz - 1);
-	for (;;) {
-		pp = memset(pp - i, 0, i);
-		if (pp - p < pagesz) return pp - p;
-		for (i = pagesz; i; i -= 2*sizeof(T), pp -= 2*sizeof(T))
-		        if (((T *)pp)[-1] | ((T *)pp)[-2])
-				break;
-	}
-}
-
-void *calloc(size_t m, size_t n)
-{
-	if (n && m > (size_t)-1/n) {
-		errno = ENOMEM;
-		return 0;
-	}
-	n *= m;
-	void *p = malloc(n);
-	if (!p) return p;
-	if (!__malloc_replaced) {
-		if (IS_MMAPPED(MEM_TO_CHUNK(p)))
-			return p;
-		if (n >= PAGE_SIZE)
-			n = mal0_clear(p, PAGE_SIZE, n);
-	}
-	return memset(p, 0, n);
+	return IS_MMAPPED(MEM_TO_CHUNK(p));
 }
 
 void *realloc(void *p, size_t n)
@@ -379,6 +362,8 @@ void *realloc(void *p, size_t n)
 	self = MEM_TO_CHUNK(p);
 	n1 = n0 = CHUNK_SIZE(self);
 
+	if (n<=n0 && n0-n<=DONTCARE) return p;
+
 	if (IS_MMAPPED(self)) {
 		size_t extra = self->psize;
 		char *base = (char *)self - extra;
@@ -405,34 +390,43 @@ void *realloc(void *p, size_t n)
 	/* Crash on corrupted footer (likely from buffer overflow) */
 	if (next->psize != self->csize) a_crash();
 
-	/* Merge adjacent chunks if we need more space. This is not
-	 * a waste of time even if we fail to get enough space, because our
-	 * subsequent call to free would otherwise have to do the merge. */
-	if (n > n1 && alloc_fwd(next)) {
-		n1 += CHUNK_SIZE(next);
-		next = NEXT_CHUNK(next);
-	}
-	/* FIXME: find what's wrong here and reenable it..? */
-	if (0 && n > n1 && alloc_rev(self)) {
-		self = PREV_CHUNK(self);
-		n1 += CHUNK_SIZE(self);
+	if (n < n0) {
+		int i = bin_index_up(n);
+		int j = bin_index(n0);
+		if (i<j && (mal.binmap & (1ULL << i)))
+			goto copy_realloc;
+		struct chunk *split = (void *)((char *)self + n);
+		self->csize = split->psize = n | C_INUSE;
+		split->csize = next->psize = n0-n | C_INUSE;
+		__bin_chunk(split);
+		return CHUNK_TO_MEM(self);
 	}
-	self->csize = n1 | C_INUSE;
-	next->psize = n1 | C_INUSE;
 
-	/* If we got enough space, split off the excess and return */
-	if (n <= n1) {
-		//memmove(CHUNK_TO_MEM(self), p, n0-OVERHEAD);
-		trim(self, n);
-		return CHUNK_TO_MEM(self);
+	lock(mal.split_merge_lock);
+
+	size_t nsize = next->csize & C_INUSE ? 0 : CHUNK_SIZE(next);
+	if (n0+nsize >= n) {
+		int i = bin_index(nsize);
+		lock_bin(i);
+		if (!(next->csize & C_INUSE)) {
+			unbin(next, i);
+			unlock_bin(i);
+			next = NEXT_CHUNK(next);
+			self->csize = next->psize = n0+nsize | C_INUSE;
+			trim(self, n);
+			unlock(mal.split_merge_lock);
+			return CHUNK_TO_MEM(self);
+		}
+		unlock_bin(i);
 	}
+	unlock(mal.split_merge_lock);
 
 copy_realloc:
 	/* As a last resort, allocate a new chunk and copy to it. */
 	new = malloc(n-OVERHEAD);
 	if (!new) return 0;
 copy_free_ret:
-	memcpy(new, p, n0-OVERHEAD);
+	memcpy(new, p, (n<n0 ? n : n0) - OVERHEAD);
 	free(CHUNK_TO_MEM(self));
 	return new;
 }
@@ -440,67 +434,61 @@ copy_free_ret:
 void __bin_chunk(struct chunk *self)
 {
 	struct chunk *next = NEXT_CHUNK(self);
-	size_t final_size, new_size, size;
-	int reclaim=0;
-	int i;
-
-	final_size = new_size = CHUNK_SIZE(self);
 
 	/* Crash on corrupted footer (likely from buffer overflow) */
 	if (next->psize != self->csize) a_crash();
 
-	for (;;) {
-		if (self->psize & next->csize & C_INUSE) {
-			self->csize = final_size | C_INUSE;
-			next->psize = final_size | C_INUSE;
-			i = bin_index(final_size);
-			lock_bin(i);
-			lock(mal.free_lock);
-			if (self->psize & next->csize & C_INUSE)
-				break;
-			unlock(mal.free_lock);
-			unlock_bin(i);
-		}
+	lock(mal.split_merge_lock);
 
-		if (alloc_rev(self)) {
-			self = PREV_CHUNK(self);
-			size = CHUNK_SIZE(self);
-			final_size += size;
-			if (new_size+size > RECLAIM && (new_size+size^size) > size)
-				reclaim = 1;
-		}
+	size_t osize = CHUNK_SIZE(self), size = osize;
+
+	/* Since we hold split_merge_lock, only transition from free to
+	 * in-use can race; in-use to free is impossible */
+	size_t psize = self->psize & C_INUSE ? 0 : CHUNK_PSIZE(self);
+	size_t nsize = next->csize & C_INUSE ? 0 : CHUNK_SIZE(next);
 
-		if (alloc_fwd(next)) {
-			size = CHUNK_SIZE(next);
-			final_size += size;
-			if (new_size+size > RECLAIM && (new_size+size^size) > size)
-				reclaim = 1;
+	if (psize) {
+		int i = bin_index(psize);
+		lock_bin(i);
+		if (!(self->psize & C_INUSE)) {
+			struct chunk *prev = PREV_CHUNK(self);
+			unbin(prev, i);
+			self = prev;
+			size += psize;
+		}
+		unlock_bin(i);
+	}
+	if (nsize) {
+		int i = bin_index(nsize);
+		lock_bin(i);
+		if (!(next->csize & C_INUSE)) {
+			unbin(next, i);
 			next = NEXT_CHUNK(next);
+			size += nsize;
 		}
+		unlock_bin(i);
 	}
 
-	if (!(mal.binmap & 1ULL<<i))
-		a_or_64(&mal.binmap, 1ULL<<i);
+	int i = bin_index(size);
+	lock_bin(i);
 
-	self->csize = final_size;
-	next->psize = final_size;
-	unlock(mal.free_lock);
-
-	self->next = BIN_TO_CHUNK(i);
-	self->prev = mal.bins[i].tail;
-	self->next->prev = self;
-	self->prev->next = self;
+	self->csize = size;
+	next->psize = size;
+	bin_chunk(self, i);
+	unlock(mal.split_merge_lock);
 
 	/* Replace middle of large chunks with fresh zero pages */
-	if (reclaim) {
+	if (size > RECLAIM && (size^(size-osize)) > size-osize) {
 		uintptr_t a = (uintptr_t)self + SIZE_ALIGN+PAGE_SIZE-1 & -PAGE_SIZE;
 		uintptr_t b = (uintptr_t)next - SIZE_ALIGN & -PAGE_SIZE;
+		int e = errno;
 #if 1
 		__madvise((void *)a, b-a, MADV_DONTNEED);
 #else
 		__mmap((void *)a, b-a, PROT_READ|PROT_WRITE,
 			MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
 #endif
+		errno = e;
 	}
 
 	unlock_bin(i);
@@ -513,7 +501,9 @@ static void unmap_chunk(struct chunk *self)
 	size_t len = CHUNK_SIZE(self) + extra;
 	/* Crash on double free */
 	if (extra & 1) a_crash();
+	int e = errno;
 	__munmap(base, len);
+	errno = e;
 }
 
 void free(void *p)
@@ -546,3 +536,21 @@ void __malloc_donate(char *start, char *end)
 	c->csize = n->psize = C_INUSE | (end-start);
 	__bin_chunk(c);
 }
+
+void __malloc_atfork(int who)
+{
+	if (who<0) {
+		lock(mal.split_merge_lock);
+		for (int i=0; i<64; i++)
+			lock(mal.bins[i].lock);
+	} else if (!who) {
+		for (int i=0; i<64; i++)
+			unlock(mal.bins[i].lock);
+		unlock(mal.split_merge_lock);
+	} else {
+		for (int i=0; i<64; i++)
+			mal.bins[i].lock[0] = mal.bins[i].lock[1] = 0;
+		mal.split_merge_lock[1] = 0;
+		mal.split_merge_lock[0] = 0;
+	}
+}
diff --git a/src/internal/malloc_impl.h b/src/malloc/oldmalloc/malloc_impl.h
index 59785a7f..e1cf4774 100644
--- a/src/internal/malloc_impl.h
+++ b/src/malloc/oldmalloc/malloc_impl.h
@@ -2,12 +2,7 @@
 #define MALLOC_IMPL_H
 
 #include <sys/mman.h>
-
-hidden void *__expand_heap(size_t *);
-
-hidden void __malloc_donate(char *, char *);
-
-hidden void *__memalign(size_t, size_t);
+#include "dynlink.h"
 
 struct chunk {
 	size_t psize, csize;
@@ -41,6 +36,4 @@ struct bin {
 
 hidden void __bin_chunk(struct chunk *);
 
-hidden extern int __malloc_replaced;
-
 #endif
diff --git a/src/malloc/malloc_usable_size.c b/src/malloc/oldmalloc/malloc_usable_size.c
index 672b518a..672b518a 100644
--- a/src/malloc/malloc_usable_size.c
+++ b/src/malloc/oldmalloc/malloc_usable_size.c
diff --git a/src/malloc/posix_memalign.c b/src/malloc/posix_memalign.c
index 2ea8bd8a..ad4d8f47 100644
--- a/src/malloc/posix_memalign.c
+++ b/src/malloc/posix_memalign.c
@@ -1,11 +1,10 @@
 #include <stdlib.h>
 #include <errno.h>
-#include "malloc_impl.h"
 
 int posix_memalign(void **res, size_t align, size_t len)
 {
 	if (align < sizeof(void *)) return EINVAL;
-	void *mem = __memalign(align, len);
+	void *mem = aligned_alloc(align, len);
 	if (!mem) return errno;
 	*res = mem;
 	return 0;
diff --git a/src/malloc/realloc.c b/src/malloc/realloc.c
new file mode 100644
index 00000000..fb0e8b7c
--- /dev/null
+++ b/src/malloc/realloc.c
@@ -0,0 +1,6 @@
+#include <stdlib.h>
+
+void *realloc(void *p, size_t n)
+{
+	return __libc_realloc(p, n);
+}
diff --git a/src/malloc/reallocarray.c b/src/malloc/reallocarray.c
new file mode 100644
index 00000000..4a6ebe46
--- /dev/null
+++ b/src/malloc/reallocarray.c
@@ -0,0 +1,13 @@
+#define _BSD_SOURCE
+#include <errno.h>
+#include <stdlib.h>
+
+void *reallocarray(void *ptr, size_t m, size_t n)
+{
+	if (n && m > -1 / n) {
+		errno = ENOMEM;
+		return 0;
+	}
+
+	return realloc(ptr, m * n);
+}
diff --git a/src/malloc/replaced.c b/src/malloc/replaced.c
new file mode 100644
index 00000000..07fce61e
--- /dev/null
+++ b/src/malloc/replaced.c
@@ -0,0 +1,4 @@
+#include "dynlink.h"
+
+int __malloc_replaced;
+int __aligned_alloc_replaced;
diff --git a/src/math/__expo2.c b/src/math/__expo2.c
index 740ac680..248f052b 100644
--- a/src/math/__expo2.c
+++ b/src/math/__expo2.c
@@ -5,12 +5,13 @@ static const int k = 2043;
 static const double kln2 = 0x1.62066151add8bp+10;
 
 /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
-double __expo2(double x)
+double __expo2(double x, double sign)
 {
 	double scale;
 
 	/* note that k is odd and scale*scale overflows */
 	INSERT_WORDS(scale, (uint32_t)(0x3ff + k/2) << 20, 0);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return exp(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return exp(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/__expo2f.c b/src/math/__expo2f.c
index 5163e418..538eb09c 100644
--- a/src/math/__expo2f.c
+++ b/src/math/__expo2f.c
@@ -5,12 +5,13 @@ static const int k = 235;
 static const float kln2 = 0x1.45c778p+7f;
 
 /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
-float __expo2f(float x)
+float __expo2f(float x, float sign)
 {
 	float scale;
 
 	/* note that k is odd and scale*scale overflows */
 	SET_FLOAT_WORD(scale, (uint32_t)(0x7f + k/2) << 23);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return expf(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return expf(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c
new file mode 100644
index 00000000..1fca99de
--- /dev/null
+++ b/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
diff --git a/src/math/__rem_pio2.c b/src/math/__rem_pio2.c
index d403f81c..dcf672fb 100644
--- a/src/math/__rem_pio2.c
+++ b/src/math/__rem_pio2.c
@@ -36,6 +36,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb54442d18p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
@@ -117,11 +118,23 @@ int __rem_pio2(double x, double *y)
 	}
 	if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */
 medium:
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n = (int32_t)fn;
 		r = x - fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round, good to 85 bits */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r - w;
 		u.f = y[0];
 		ey = u.i>>52 & 0x7ff;
diff --git a/src/math/__rem_pio2f.c b/src/math/__rem_pio2f.c
index 4473c1c4..e6765643 100644
--- a/src/math/__rem_pio2f.c
+++ b/src/math/__rem_pio2f.c
@@ -35,6 +35,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb6p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */
 pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
@@ -50,10 +51,20 @@ int __rem_pio2f(float x, double *y)
 	ix = u.i & 0x7fffffff;
 	/* 25+53 bit pi is good enough for medium size */
 	if (ix < 0x4dc90fdb) {  /* |x| ~< 2^28*(pi/2), medium size */
-		/* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+		/* Use a specialized rint() to get fn. */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n  = (int32_t)fn;
 		*y = x - fn*pio2_1 - fn*pio2_1t;
+		/* Matters with directed rounding. */
+		if (predict_false(*y < -pio4)) {
+			n--;
+			fn--;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		} else if (predict_false(*y > pio4)) {
+			n++;
+			fn++;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		}
 		return n;
 	}
 	if(ix>=0x7f800000) {  /* x is inf or NaN */
diff --git a/src/math/__rem_pio2l.c b/src/math/__rem_pio2l.c
index 77255bd8..236b2def 100644
--- a/src/math/__rem_pio2l.c
+++ b/src/math/__rem_pio2l.c
@@ -44,6 +44,7 @@ pio2_1 =  1.57079632679597125389e+00, /* 0x3FF921FB, 0x54444000 */
 pio2_2 = -1.07463465549783099519e-12, /* -0x12e7b967674000.0p-92 */
 pio2_3 =  6.36831716351370313614e-25; /*  0x18a2e037074000.0p-133 */
 static const long double
+pio4    =  0x1.921fb54442d1846ap-1L,
 invpio2 =  6.36619772367581343076e-01L, /*  0xa2f9836e4e44152a.0p-64 */
 pio2_1t = -1.07463465549719416346e-12L, /* -0x973dcb3b399d747f.0p-103 */
 pio2_2t =  6.36831716351095013979e-25L, /*  0xc51701b839a25205.0p-144 */
@@ -57,6 +58,7 @@ pio2_3t = -2.75299651904407171810e-37L; /* -0xbb5bf6c7ddd660ce.0p-185 */
 #define NX 5
 #define NY 3
 static const long double
+pio4    =  0x1.921fb54442d18469898cc51701b8p-1L,
 invpio2 =  6.3661977236758134307553505349005747e-01L,	/*  0x145f306dc9c882a53f84eafa3ea6a.0p-113 */
 pio2_1  =  1.5707963267948966192292994253909555e+00L,	/*  0x1921fb54442d18469800000000000.0p-112 */
 pio2_1t =  2.0222662487959507323996846200947577e-21L,	/*  0x13198a2e03707344a4093822299f3.0p-181 */
@@ -76,11 +78,23 @@ int __rem_pio2l(long double x, long double *y)
 	u.f = x;
 	ex = u.i.se & 0x7fff;
 	if (SMALL(u)) {
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = x*invpio2 + toint - toint;
 		n = QUOBITS(fn);
 		r = x-fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round good to 102/180 bits (ld80/ld128) */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r-w;
 		u.f = y[0];
 		ey = u.i.se & 0x7fff;
diff --git a/src/math/acoshf.c b/src/math/acoshf.c
index 8a4ec4d5..b773d48e 100644
--- a/src/math/acoshf.c
+++ b/src/math/acoshf.c
@@ -15,12 +15,12 @@ float acoshf(float x)
 	uint32_t a = u.i & 0x7fffffff;
 
 	if (a < 0x3f800000+(1<<23))
-		/* |x| < 2, invalid if x < 1 or nan */
+		/* |x| < 2, invalid if x < 1 */
 		/* up to 2ulp error in [1,1.125] */
 		return log1pf(x-1 + sqrtf((x-1)*(x-1)+2*(x-1)));
-	if (a < 0x3f800000+(12<<23))
-		/* |x| < 0x1p12 */
+	if (u.i < 0x3f800000+(12<<23))
+		/* 2 <= x < 0x1p12 */
 		return logf(2*x - 1/(x+sqrtf(x*x-1)));
-	/* x >= 0x1p12 */
+	/* x >= 0x1p12 or x <= -2 or nan */
 	return logf(x) + 0.693147180559945309417232121458176568f;
 }
diff --git a/src/math/acoshl.c b/src/math/acoshl.c
index 8d4b43f6..943cec17 100644
--- a/src/math/acoshl.c
+++ b/src/math/acoshl.c
@@ -10,14 +10,18 @@ long double acoshl(long double x)
 long double acoshl(long double x)
 {
 	union ldshape u = {x};
-	int e = u.i.se & 0x7fff;
+	int e = u.i.se;
 
 	if (e < 0x3fff + 1)
-		/* |x| < 2, invalid if x < 1 or nan */
+		/* 0 <= x < 2, invalid if x < 1 */
 		return log1pl(x-1 + sqrtl((x-1)*(x-1)+2*(x-1)));
 	if (e < 0x3fff + 32)
-		/* |x| < 0x1p32 */
+		/* 2 <= x < 0x1p32 */
 		return logl(2*x - 1/(x+sqrtl(x*x-1)));
+	if (e & 0x8000)
+		/* x < 0 or x = -0, invalid */
+		return (x - x) / (x - x);
+	/* 0x1p32 <= x or nan */
 	return logl(x) + 0.693147180559945309417232121458176568L;
 }
 #elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
diff --git a/src/math/arm/fabs.c b/src/math/arm/fabs.c
index f890520a..6e1d367d 100644
--- a/src/math/arm/fabs.c
+++ b/src/math/arm/fabs.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP
+#if __ARM_PCS_VFP && __ARM_FP&8
 
 double fabs(double x)
 {
diff --git a/src/math/arm/sqrt.c b/src/math/arm/sqrt.c
index 874af960..567e2e91 100644
--- a/src/math/arm/sqrt.c
+++ b/src/math/arm/sqrt.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)
+#if (__ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)) && (__ARM_FP&8)
 
 double sqrt(double x)
 {
diff --git a/src/math/cosh.c b/src/math/cosh.c
index 100f8231..490c15fb 100644
--- a/src/math/cosh.c
+++ b/src/math/cosh.c
@@ -35,6 +35,6 @@ double cosh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = __expo2(x);
+	t = __expo2(x, 1.0);
 	return t;
 }
diff --git a/src/math/coshf.c b/src/math/coshf.c
index b09f2ee5..e739cff9 100644
--- a/src/math/coshf.c
+++ b/src/math/coshf.c
@@ -28,6 +28,6 @@ float coshf(float x)
 	}
 
 	/* |x| > log(FLT_MAX) or nan */
-	t = __expo2f(x);
+	t = __expo2f(x, 1.0f);
 	return t;
 }
diff --git a/src/math/expm1f.c b/src/math/expm1f.c
index 297e0b44..09a41afe 100644
--- a/src/math/expm1f.c
+++ b/src/math/expm1f.c
@@ -16,7 +16,6 @@
 #include "libm.h"
 
 static const float
-o_threshold = 8.8721679688e+01, /* 0x42b17180 */
 ln2_hi      = 6.9313812256e-01, /* 0x3f317180 */
 ln2_lo      = 9.0580006145e-06, /* 0x3717f7d1 */
 invln2      = 1.4426950216e+00, /* 0x3fb8aa3b */
@@ -41,7 +40,7 @@ float expm1f(float x)
 			return x;
 		if (sign)
 			return -1;
-		if (x > o_threshold) {
+		if (hx > 0x42b17217) { /* x > log(FLT_MAX) */
 			x *= 0x1p127f;
 			return x;
 		}
diff --git a/src/math/fma.c b/src/math/fma.c
index 0c6f90c9..adfadca8 100644
--- a/src/math/fma.c
+++ b/src/math/fma.c
@@ -53,7 +53,7 @@ double fma(double x, double y, double z)
 		return x*y + z;
 	if (nz.e >= ZEROINFNAN) {
 		if (nz.e > ZEROINFNAN) /* z==0 */
-			return x*y + z;
+			return x*y;
 		return z;
 	}
 
diff --git a/src/math/fmaf.c b/src/math/fmaf.c
index 80f5cd8a..7c65acf1 100644
--- a/src/math/fmaf.c
+++ b/src/math/fmaf.c
@@ -77,17 +77,16 @@ float fmaf(float x, float y, float z)
 	 * If result is inexact, and exactly halfway between two float values,
 	 * we need to adjust the low-order bit in the direction of the error.
 	 */
-#ifdef FE_TOWARDZERO
-	fesetround(FE_TOWARDZERO);
-#endif
-	volatile double vxy = xy;  /* XXX work around gcc CSE bug */
-	double adjusted_result = vxy + z;
-	fesetround(FE_TONEAREST);
-	if (result == adjusted_result) {
-		u.f = adjusted_result;
+	double err;
+	int neg = u.i >> 63;
+	if (neg == (z > xy))
+		err = xy - result + z;
+	else
+		err = z - result + xy;
+	if (neg == (err < 0))
 		u.i++;
-		adjusted_result = u.f;
-	}
-	z = adjusted_result;
+	else
+		u.i--;
+	z = u.f;
 	return z;
 }
diff --git a/src/math/i386/exp.c b/src/math/i386/exp.c
deleted file mode 100644
index 11282284..00000000
--- a/src/math/i386/exp.c
+++ /dev/null
@@ -1 +0,0 @@
-// see exp_ld.s
diff --git a/src/math/i386/exp2.s b/src/math/i386/exp2.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/exp2.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/exp2f.s b/src/math/i386/exp2f.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/exp2f.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/exp2l.s b/src/math/i386/exp2l.s
index f335a3e5..8125761d 100644
--- a/src/math/i386/exp2l.s
+++ b/src/math/i386/exp2l.s
@@ -1 +1 @@
-# see exp.s
+# see exp_ld.s
diff --git a/src/math/i386/exp_ld.s b/src/math/i386/exp_ld.s
index df87c497..99cba01f 100644
--- a/src/math/i386/exp_ld.s
+++ b/src/math/i386/exp_ld.s
@@ -1,35 +1,8 @@
-.global expm1f
-.type expm1f,@function
-expm1f:
-	flds 4(%esp)
-	mov 4(%esp),%eax
-	add %eax,%eax
-	cmp $0x01000000,%eax
-	jae 1f
-		# subnormal x, return x with underflow
-	fld %st(0)
-	fmul %st(1)
-	fstps 4(%esp)
-	ret
-
 .global expm1l
 .type expm1l,@function
 expm1l:
 	fldt 4(%esp)
-	jmp 1f
-
-.global expm1
-.type expm1,@function
-expm1:
-	fldl 4(%esp)
-	mov 8(%esp),%eax
-	add %eax,%eax
-	cmp $0x00200000,%eax
-	jae 1f
-		# subnormal x, return x with underflow
-	fsts 4(%esp)
-	ret
-1:	fldl2e
+	fldl2e
 	fmulp
 	mov $0xc2820000,%eax
 	push %eax
@@ -59,12 +32,6 @@ expm1:
 	fsubrp
 	ret
 
-.global exp2f
-.type exp2f,@function
-exp2f:
-	flds 4(%esp)
-	jmp 1f
-
 .global exp2l
 .global __exp2l
 .hidden __exp2l
@@ -72,26 +39,6 @@ exp2f:
 exp2l:
 __exp2l:
 	fldt 4(%esp)
-	jmp 1f
-
-.global expf
-.type expf,@function
-expf:
-	flds 4(%esp)
-	jmp 2f
-
-.global exp
-.type exp,@function
-exp:
-	fldl 4(%esp)
-2:	fldl2e
-	fmulp
-	jmp 1f
-
-.global exp2
-.type exp2,@function
-exp2:
-	fldl 4(%esp)
 1:	sub $12,%esp
 	fld %st(0)
 	fstpt (%esp)
diff --git a/src/math/i386/expf.s b/src/math/i386/expf.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expf.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1.s b/src/math/i386/expm1.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expm1.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1f.s b/src/math/i386/expm1f.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expm1f.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1l.s b/src/math/i386/expm1l.s
index f335a3e5..8125761d 100644
--- a/src/math/i386/expm1l.s
+++ b/src/math/i386/expm1l.s
@@ -1 +1 @@
-# see exp.s
+# see exp_ld.s
diff --git a/src/math/i386/fabs.c b/src/math/i386/fabs.c
new file mode 100644
index 00000000..39672786
--- /dev/null
+++ b/src/math/i386/fabs.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double fabs(double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabs.s b/src/math/i386/fabs.s
deleted file mode 100644
index d66ea9a1..00000000
--- a/src/math/i386/fabs.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabs
-.type fabs,@function
-fabs:
-	fldl 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fabsf.c b/src/math/i386/fabsf.c
new file mode 100644
index 00000000..d882eee3
--- /dev/null
+++ b/src/math/i386/fabsf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float fabsf(float x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabsf.s b/src/math/i386/fabsf.s
deleted file mode 100644
index a981c422..00000000
--- a/src/math/i386/fabsf.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsf
-.type fabsf,@function
-fabsf:
-	flds 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fabsl.c b/src/math/i386/fabsl.c
new file mode 100644
index 00000000..cc1c9ed9
--- /dev/null
+++ b/src/math/i386/fabsl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double fabsl(long double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabsl.s b/src/math/i386/fabsl.s
deleted file mode 100644
index ceef9e4c..00000000
--- a/src/math/i386/fabsl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsl
-.type fabsl,@function
-fabsl:
-	fldt 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fmod.c b/src/math/i386/fmod.c
new file mode 100644
index 00000000..ea0c58d9
--- /dev/null
+++ b/src/math/i386/fmod.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+double fmod(double x, double y)
+{
+	unsigned short fpsr;
+	// fprem does not introduce excess precision into x
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmod.s b/src/math/i386/fmod.s
deleted file mode 100644
index 2113b3c5..00000000
--- a/src/math/i386/fmod.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmod
-.type fmod,@function
-fmod:
-	fldl 12(%esp)
-	fldl 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/fmodf.c b/src/math/i386/fmodf.c
new file mode 100644
index 00000000..90b56ab0
--- /dev/null
+++ b/src/math/i386/fmodf.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+float fmodf(float x, float y)
+{
+	unsigned short fpsr;
+	// fprem does not introduce excess precision into x
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmodf.s b/src/math/i386/fmodf.s
deleted file mode 100644
index e04e2a56..00000000
--- a/src/math/i386/fmodf.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodf
-.type fmodf,@function
-fmodf:
-	flds 8(%esp)
-	flds 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/fmodl.c b/src/math/i386/fmodl.c
new file mode 100644
index 00000000..3daeab06
--- /dev/null
+++ b/src/math/i386/fmodl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double fmodl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmodl.s b/src/math/i386/fmodl.s
deleted file mode 100644
index 0cb3fe9b..00000000
--- a/src/math/i386/fmodl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodl
-.type fmodl,@function
-fmodl:
-	fldt 16(%esp)
-	fldt 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/llrint.c b/src/math/i386/llrint.c
new file mode 100644
index 00000000..aa400817
--- /dev/null
+++ b/src/math/i386/llrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrint(double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrint.s b/src/math/i386/llrint.s
deleted file mode 100644
index 8e89cd91..00000000
--- a/src/math/i386/llrint.s
+++ /dev/null
@@ -1,8 +0,0 @@
-.global llrint
-.type llrint,@function
-llrint:
-	fldl 4(%esp)
-	fistpll 4(%esp)
-	mov 4(%esp),%eax
-	mov 8(%esp),%edx
-	ret
diff --git a/src/math/i386/llrintf.c b/src/math/i386/llrintf.c
new file mode 100644
index 00000000..c41a317b
--- /dev/null
+++ b/src/math/i386/llrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintf(float x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrintf.s b/src/math/i386/llrintf.s
deleted file mode 100644
index aa850c6c..00000000
--- a/src/math/i386/llrintf.s
+++ /dev/null
@@ -1,9 +0,0 @@
-.global llrintf
-.type llrintf,@function
-llrintf:
-	sub $8,%esp
-	flds 12(%esp)
-	fistpll (%esp)
-	pop %eax
-	pop %edx
-	ret
diff --git a/src/math/i386/llrintl.c b/src/math/i386/llrintl.c
new file mode 100644
index 00000000..c439ef28
--- /dev/null
+++ b/src/math/i386/llrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintl(long double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrintl.s b/src/math/i386/llrintl.s
deleted file mode 100644
index 1cfb56f1..00000000
--- a/src/math/i386/llrintl.s
+++ /dev/null
@@ -1,8 +0,0 @@
-.global llrintl
-.type llrintl,@function
-llrintl:
-	fldt 4(%esp)
-	fistpll 4(%esp)
-	mov 4(%esp),%eax
-	mov 8(%esp),%edx
-	ret
diff --git a/src/math/i386/lrint.c b/src/math/i386/lrint.c
new file mode 100644
index 00000000..89563ab2
--- /dev/null
+++ b/src/math/i386/lrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrint(double x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrint.s b/src/math/i386/lrint.s
deleted file mode 100644
index 02b83d9f..00000000
--- a/src/math/i386/lrint.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrint
-.type lrint,@function
-lrint:
-	fldl 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/lrintf.c b/src/math/i386/lrintf.c
new file mode 100644
index 00000000..0bbf29de
--- /dev/null
+++ b/src/math/i386/lrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintf(float x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrintf.s b/src/math/i386/lrintf.s
deleted file mode 100644
index 907aac29..00000000
--- a/src/math/i386/lrintf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintf
-.type lrintf,@function
-lrintf:
-	flds 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/lrintl.c b/src/math/i386/lrintl.c
new file mode 100644
index 00000000..eb8c0902
--- /dev/null
+++ b/src/math/i386/lrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintl(long double x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrintl.s b/src/math/i386/lrintl.s
deleted file mode 100644
index 3ae05aac..00000000
--- a/src/math/i386/lrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintl
-.type lrintl,@function
-lrintl:
-	fldt 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/remainder.c b/src/math/i386/remainder.c
new file mode 100644
index 00000000..c083df90
--- /dev/null
+++ b/src/math/i386/remainder.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+double remainder(double x, double y)
+{
+	unsigned short fpsr;
+	// fprem1 does not introduce excess precision into x
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
+
+weak_alias(remainder, drem);
diff --git a/src/math/i386/remainder.s b/src/math/i386/remainder.s
deleted file mode 100644
index ab1da95d..00000000
--- a/src/math/i386/remainder.s
+++ /dev/null
@@ -1,14 +0,0 @@
-.global remainder
-.type remainder,@function
-remainder:
-.weak drem
-.type drem,@function
-drem:
-	fldl 12(%esp)
-	fldl 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/remainderf.c b/src/math/i386/remainderf.c
new file mode 100644
index 00000000..280207d2
--- /dev/null
+++ b/src/math/i386/remainderf.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+float remainderf(float x, float y)
+{
+	unsigned short fpsr;
+	// fprem1 does not introduce excess precision into x
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
+
+weak_alias(remainderf, dremf);
diff --git a/src/math/i386/remainderf.s b/src/math/i386/remainderf.s
deleted file mode 100644
index 6a7378a3..00000000
--- a/src/math/i386/remainderf.s
+++ /dev/null
@@ -1,14 +0,0 @@
-.global remainderf
-.type remainderf,@function
-remainderf:
-.weak dremf
-.type dremf,@function
-dremf:
-	flds 8(%esp)
-	flds 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/remainderl.c b/src/math/i386/remainderl.c
new file mode 100644
index 00000000..8cf75071
--- /dev/null
+++ b/src/math/i386/remainderl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double remainderl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/remainderl.s b/src/math/i386/remainderl.s
deleted file mode 100644
index b41518ed..00000000
--- a/src/math/i386/remainderl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global remainderl
-.type remainderl,@function
-remainderl:
-	fldt 16(%esp)
-	fldt 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/rint.c b/src/math/i386/rint.c
new file mode 100644
index 00000000..a5276a60
--- /dev/null
+++ b/src/math/i386/rint.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double rint(double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rint.s b/src/math/i386/rint.s
deleted file mode 100644
index bb99a11c..00000000
--- a/src/math/i386/rint.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rint
-.type rint,@function
-rint:
-	fldl 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/rintf.c b/src/math/i386/rintf.c
new file mode 100644
index 00000000..bb4121a4
--- /dev/null
+++ b/src/math/i386/rintf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float rintf(float x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rintf.s b/src/math/i386/rintf.s
deleted file mode 100644
index bce4c5a6..00000000
--- a/src/math/i386/rintf.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintf
-.type rintf,@function
-rintf:
-	flds 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/rintl.c b/src/math/i386/rintl.c
new file mode 100644
index 00000000..e1a92077
--- /dev/null
+++ b/src/math/i386/rintl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double rintl(long double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rintl.s b/src/math/i386/rintl.s
deleted file mode 100644
index cd2bf9a9..00000000
--- a/src/math/i386/rintl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintl
-.type rintl,@function
-rintl:
-	fldt 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/sqrt.c b/src/math/i386/sqrt.c
new file mode 100644
index 00000000..934fbcca
--- /dev/null
+++ b/src/math/i386/sqrt.c
@@ -0,0 +1,15 @@
+#include "libm.h"
+
+double sqrt(double x)
+{
+	union ldshape ux;
+	unsigned fpsr;
+	__asm__ ("fsqrt; fnstsw %%ax": "=t"(ux.f), "=a"(fpsr) : "0"(x));
+	if ((ux.i.m & 0x7ff) != 0x400)
+		return (double)ux.f;
+	/* Rounding to double would have encountered an exact halfway case.
+	   Adjust mantissa downwards if fsqrt rounded up, else upwards.
+	   (result of fsqrt could not have been exact) */
+	ux.i.m ^= (fpsr & 0x200) + 0x300;
+	return (double)ux.f;
+}
diff --git a/src/math/i386/sqrt.s b/src/math/i386/sqrt.s
deleted file mode 100644
index 57837e25..00000000
--- a/src/math/i386/sqrt.s
+++ /dev/null
@@ -1,21 +0,0 @@
-.global sqrt
-.type sqrt,@function
-sqrt:	fldl 4(%esp)
-	fsqrt
-	fnstsw %ax
-	sub $12,%esp
-	fld %st(0)
-	fstpt (%esp)
-	mov (%esp),%ecx
-	and $0x7ff,%ecx
-	cmp $0x400,%ecx
-	jnz 1f
-	and $0x200,%eax
-	sub $0x100,%eax
-	sub %eax,(%esp)
-	fstp %st(0)
-	fldt (%esp)
-1:	add $12,%esp
-	fstpl 4(%esp)
-	fldl 4(%esp)
-	ret
diff --git a/src/math/i386/sqrtf.c b/src/math/i386/sqrtf.c
new file mode 100644
index 00000000..41c65c2b
--- /dev/null
+++ b/src/math/i386/sqrtf.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+float sqrtf(float x)
+{
+	long double t;
+	/* The long double result has sufficient precision so that
+	 * second rounding to float still keeps the returned value
+	 * correctly rounded, see Pierre Roux, "Innocuous Double
+	 * Rounding of Basic Arithmetic Operations". */
+	__asm__ ("fsqrt" : "=t"(t) : "0"(x));
+	return (float)t;
+}
diff --git a/src/math/i386/sqrtf.s b/src/math/i386/sqrtf.s
deleted file mode 100644
index 9e944f45..00000000
--- a/src/math/i386/sqrtf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global sqrtf
-.type sqrtf,@function
-sqrtf:	flds 4(%esp)
-	fsqrt
-	fstps 4(%esp)
-	flds 4(%esp)
-	ret
diff --git a/src/math/i386/sqrtl.c b/src/math/i386/sqrtl.c
new file mode 100644
index 00000000..864cfcc4
--- /dev/null
+++ b/src/math/i386/sqrtl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/sqrtl.s b/src/math/i386/sqrtl.s
deleted file mode 100644
index e0d42616..00000000
--- a/src/math/i386/sqrtl.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global sqrtl
-.type sqrtl,@function
-sqrtl:	fldt 4(%esp)
-	fsqrt
-	ret
diff --git a/src/math/logf.c b/src/math/logf.c
index 7ee5d7fe..e4c2237c 100644
--- a/src/math/logf.c
+++ b/src/math/logf.c
@@ -53,7 +53,7 @@ float logf(float x)
 	tmp = ix - OFF;
 	i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
 	k = (int32_t)tmp >> 23; /* arithmetic shift */
-	iz = ix - (tmp & 0x1ff << 23);
+	iz = ix - (tmp & 0xff800000);
 	invc = T[i].invc;
 	logc = T[i].logc;
 	z = (double_t)asfloat(iz);
diff --git a/src/math/m68k/sqrtl.c b/src/math/m68k/sqrtl.c
new file mode 100644
index 00000000..b1c303c7
--- /dev/null
+++ b/src/math/m68k/sqrtl.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __HAVE_68881__
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt.x %1,%0" : "=f"(x) : "fm"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrtl.c"
+
+#endif
diff --git a/src/math/powerpc/fabs.c b/src/math/powerpc/fabs.c
index 0efc21ef..9453a3aa 100644
--- a/src/math/powerpc/fabs.c
+++ b/src/math/powerpc/fabs.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM)
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)
 
 #include "../fabs.c"
 
diff --git a/src/math/powerpc/fabsf.c b/src/math/powerpc/fabsf.c
index d88b5911..2e9da588 100644
--- a/src/math/powerpc/fabsf.c
+++ b/src/math/powerpc/fabsf.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 
 #include "../fabsf.c"
 
diff --git a/src/math/powerpc/fma.c b/src/math/powerpc/fma.c
index 135c9903..0eb2ba1e 100644
--- a/src/math/powerpc/fma.c
+++ b/src/math/powerpc/fma.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM)
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)
 
 #include "../fma.c"
 
diff --git a/src/math/powerpc/fmaf.c b/src/math/powerpc/fmaf.c
index a99a2a3b..dc1a749d 100644
--- a/src/math/powerpc/fmaf.c
+++ b/src/math/powerpc/fmaf.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 
 #include "../fmaf.c"
 
diff --git a/src/math/powl.c b/src/math/powl.c
index 5b6da07b..9eb22162 100644
--- a/src/math/powl.c
+++ b/src/math/powl.c
@@ -57,14 +57,6 @@
  *    IEEE     0,8700       60000      6.5e-18      1.0e-18
  * 0.99 < x < 1.01, 0 < y < 8700, uniformly distributed.
  *
- *
- * ERROR MESSAGES:
- *
- *   message         condition      value returned
- * pow overflow     x**y > MAXNUM      INFINITY
- * pow underflow   x**y < 1/MAXNUM       0.0
- * pow domain      x<0 and y noninteger  0.0
- *
  */
 
 #include "libm.h"
@@ -212,25 +204,33 @@ long double powl(long double x, long double y)
 	}
 	if (x == 1.0)
 		return 1.0; /* 1**y = 1, even if y is nan */
-	if (x == -1.0 && !isfinite(y))
-		return 1.0; /* -1**inf = 1 */
 	if (y == 0.0)
 		return 1.0; /* x**0 = 1, even if x is nan */
 	if (y == 1.0)
 		return x;
-	if (y >= LDBL_MAX) {
-		if (x > 1.0 || x < -1.0)
-			return INFINITY;
-		if (x != 0.0)
-			return 0.0;
-	}
-	if (y <= -LDBL_MAX) {
-		if (x > 1.0 || x < -1.0)
+	/* if y*log2(x) < log2(LDBL_TRUE_MIN)-1 then x^y uflows to 0
+	   if y*log2(x) > -log2(LDBL_TRUE_MIN)+1 > LDBL_MAX_EXP then x^y oflows
+	   if |x|!=1 then |log2(x)| > |log(x)| > LDBL_EPSILON/2 so
+	   x^y oflows/uflows if |y|*LDBL_EPSILON/2 > -log2(LDBL_TRUE_MIN)+1 */
+	if (fabsl(y) > 2*(-LDBL_MIN_EXP+LDBL_MANT_DIG+1)/LDBL_EPSILON) {
+		/* y is not an odd int */
+		if (x == -1.0)
+			return 1.0;
+		if (y == INFINITY) {
+			if (x > 1.0 || x < -1.0)
+				return INFINITY;
 			return 0.0;
-		if (x != 0.0 || y == -INFINITY)
+		}
+		if (y == -INFINITY) {
+			if (x > 1.0 || x < -1.0)
+				return 0.0;
 			return INFINITY;
+		}
+		if ((x > 1.0 || x < -1.0) == (y > 0))
+			return huge * huge;
+		return twom10000 * twom10000;
 	}
-	if (x >= LDBL_MAX) {
+	if (x == INFINITY) {
 		if (y > 0.0)
 			return INFINITY;
 		return 0.0;
@@ -253,7 +253,7 @@ long double powl(long double x, long double y)
 			yoddint = 1;
 	}
 
-	if (x <= -LDBL_MAX) {
+	if (x == -INFINITY) {
 		if (y > 0.0) {
 			if (yoddint)
 				return -INFINITY;
diff --git a/src/math/riscv32/copysign.c b/src/math/riscv32/copysign.c
new file mode 100644
index 00000000..c7854178
--- /dev/null
+++ b/src/math/riscv32/copysign.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double copysign(double x, double y)
+{
+	__asm__ ("fsgnj.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../copysign.c"
+
+#endif
diff --git a/src/math/riscv32/copysignf.c b/src/math/riscv32/copysignf.c
new file mode 100644
index 00000000..a125611a
--- /dev/null
+++ b/src/math/riscv32/copysignf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float copysignf(float x, float y)
+{
+	__asm__ ("fsgnj.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../copysignf.c"
+
+#endif
diff --git a/src/math/riscv32/fabs.c b/src/math/riscv32/fabs.c
new file mode 100644
index 00000000..5290b6f0
--- /dev/null
+++ b/src/math/riscv32/fabs.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fabs(double x)
+{
+	__asm__ ("fabs.d %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../fabs.c"
+
+#endif
diff --git a/src/math/riscv32/fabsf.c b/src/math/riscv32/fabsf.c
new file mode 100644
index 00000000..f5032e35
--- /dev/null
+++ b/src/math/riscv32/fabsf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fabsf(float x)
+{
+	__asm__ ("fabs.s %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../fabsf.c"
+
+#endif
diff --git a/src/math/riscv32/fma.c b/src/math/riscv32/fma.c
new file mode 100644
index 00000000..99b05713
--- /dev/null
+++ b/src/math/riscv32/fma.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fma(double x, double y, double z)
+{
+	__asm__ ("fmadd.d %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z));
+	return x;
+}
+
+#else
+
+#include "../fma.c"
+
+#endif
diff --git a/src/math/riscv32/fmaf.c b/src/math/riscv32/fmaf.c
new file mode 100644
index 00000000..f9dc47ed
--- /dev/null
+++ b/src/math/riscv32/fmaf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fmaf(float x, float y, float z)
+{
+	__asm__ ("fmadd.s %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z));
+	return x;
+}
+
+#else
+
+#include "../fmaf.c"
+
+#endif
diff --git a/src/math/riscv32/fmax.c b/src/math/riscv32/fmax.c
new file mode 100644
index 00000000..023709cd
--- /dev/null
+++ b/src/math/riscv32/fmax.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fmax(double x, double y)
+{
+	__asm__ ("fmax.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmax.c"
+
+#endif
diff --git a/src/math/riscv32/fmaxf.c b/src/math/riscv32/fmaxf.c
new file mode 100644
index 00000000..863d2bd1
--- /dev/null
+++ b/src/math/riscv32/fmaxf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fmaxf(float x, float y)
+{
+	__asm__ ("fmax.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmaxf.c"
+
+#endif
diff --git a/src/math/riscv32/fmin.c b/src/math/riscv32/fmin.c
new file mode 100644
index 00000000..a4e3b067
--- /dev/null
+++ b/src/math/riscv32/fmin.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fmin(double x, double y)
+{
+	__asm__ ("fmin.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmin.c"
+
+#endif
diff --git a/src/math/riscv32/fminf.c b/src/math/riscv32/fminf.c
new file mode 100644
index 00000000..32156e80
--- /dev/null
+++ b/src/math/riscv32/fminf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fminf(float x, float y)
+{
+	__asm__ ("fmin.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fminf.c"
+
+#endif
diff --git a/src/math/riscv32/sqrt.c b/src/math/riscv32/sqrt.c
new file mode 100644
index 00000000..867a504c
--- /dev/null
+++ b/src/math/riscv32/sqrt.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double sqrt(double x)
+{
+	__asm__ ("fsqrt.d %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrt.c"
+
+#endif
diff --git a/src/math/riscv32/sqrtf.c b/src/math/riscv32/sqrtf.c
new file mode 100644
index 00000000..610c2cf8
--- /dev/null
+++ b/src/math/riscv32/sqrtf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float sqrtf(float x)
+{
+	__asm__ ("fsqrt.s %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrtf.c"
+
+#endif
diff --git a/src/math/sinh.c b/src/math/sinh.c
index 00022c4e..a01951ae 100644
--- a/src/math/sinh.c
+++ b/src/math/sinh.c
@@ -34,6 +34,6 @@ double sinh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = 2*h*__expo2(absx);
+	t = __expo2(absx, 2*h);
 	return t;
 }
diff --git a/src/math/sinhf.c b/src/math/sinhf.c
index 6ad19ea2..b9caa793 100644
--- a/src/math/sinhf.c
+++ b/src/math/sinhf.c
@@ -26,6 +26,6 @@ float sinhf(float x)
 	}
 
 	/* |x| > logf(FLT_MAX) or nan */
-	t = 2*h*__expo2f(absx);
+	t = __expo2f(absx, 2*h);
 	return t;
 }
diff --git a/src/math/sqrt.c b/src/math/sqrt.c
index f1f6d76c..5ba26559 100644
--- a/src/math/sqrt.c
+++ b/src/math/sqrt.c
@@ -1,184 +1,158 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* sqrt(x)
- * Return correctly rounded sqrt.
- *           ------------------------------------------
- *           |  Use the hardware sqrt if you have one |
- *           ------------------------------------------
- * Method:
- *   Bit by bit method using integer arithmetic. (Slow, but portable)
- *   1. Normalization
- *      Scale x to y in [1,4) with even powers of 2:
- *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
- *              sqrt(x) = 2^k * sqrt(y)
- *   2. Bit by bit computation
- *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
- *           i                                                   0
- *                                     i+1         2
- *          s  = 2*q , and      y  =  2   * ( y - q  ).         (1)
- *           i      i            i                 i
- *
- *      To compute q    from q , one checks whether
- *                  i+1       i
- *
- *                            -(i+1) 2
- *                      (q + 2      ) <= y.                     (2)
- *                        i
- *                                                            -(i+1)
- *      If (2) is false, then q   = q ; otherwise q   = q  + 2      .
- *                             i+1   i             i+1   i
- *
- *      With some algebric manipulation, it is not difficult to see
- *      that (2) is equivalent to
- *                             -(i+1)
- *                      s  +  2       <= y                      (3)
- *                       i                i
- *
- *      The advantage of (3) is that s  and y  can be computed by
- *                                    i      i
- *      the following recurrence formula:
- *          if (3) is false
- *
- *          s     =  s  ,       y    = y   ;                    (4)
- *           i+1      i          i+1    i
- *
- *          otherwise,
- *                         -i                     -(i+1)
- *          s     =  s  + 2  ,  y    = y  -  s  - 2             (5)
- *           i+1      i          i+1    i     i
- *
- *      One may easily use induction to prove (4) and (5).
- *      Note. Since the left hand side of (3) contain only i+2 bits,
- *            it does not necessary to do a full (53-bit) comparison
- *            in (3).
- *   3. Final rounding
- *      After generating the 53 bits result, we compute one more bit.
- *      Together with the remainder, we can decide whether the
- *      result is exact, bigger than 1/2ulp, or less than 1/2ulp
- *      (it will never equal to 1/2ulp).
- *      The rounding mode can be detected by checking whether
- *      huge + tiny is equal to huge, and whether huge - tiny is
- *      equal to huge for some floating point number "huge" and "tiny".
- *
- * Special cases:
- *      sqrt(+-0) = +-0         ... exact
- *      sqrt(inf) = inf
- *      sqrt(-ve) = NaN         ... with invalid signal
- *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const double tiny = 1.0e-300;
+#define FENV_SUPPORT 1
 
-double sqrt(double x)
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	double z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix0,s0,q,m,t,i;
-	uint32_t r,t1,s1,ix1,q1;
+	return (uint64_t)a*b >> 32;
+}
 
-	EXTRACT_WORDS(ix0, ix1, x);
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
 
-	/* take care of Inf and NaN */
-	if ((ix0&0x7ff00000) == 0x7ff00000) {
-		return x*x + x;  /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-	}
-	/* take care of zero */
-	if (ix0 <= 0) {
-		if (((ix0&~sign)|ix1) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix0 < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix0>>20;
-	if (m == 0) {  /* subnormal x */
-		while (ix0 == 0) {
-			m -= 21;
-			ix0 |= (ix1>>11);
-			ix1 <<= 21;
-		}
-		for (i=0; (ix0&0x00100000) == 0; i++)
-			ix0<<=1;
-		m -= i - 1;
-		ix0 |= ix1>>(32-i);
-		ix1 <<= i;
-	}
-	m -= 1023;    /* unbias exponent */
-	ix0 = (ix0&0x000fffff)|0x00100000;
-	if (m & 1) {  /* odd m, double x to make it even */
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-	}
-	m >>= 1;      /* m = [m/2] */
-
-	/* generate sqrt(x) bit by bit */
-	ix0 += ix0 + ((ix1&sign)>>31);
-	ix1 += ix1;
-	q = q1 = s0 = s1 = 0;  /* [q,q1] = sqrt(x) */
-	r = 0x00200000;        /* r = moving bit from right to left */
-
-	while (r != 0) {
-		t = s0 + r;
-		if (t <= ix0) {
-			s0   = t + r;
-			ix0 -= t;
-			q   += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
-	}
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
 
-	r = sign;
-	while (r != 0) {
-		t1 = s1 + r;
-		t  = s0;
-		if (t < ix0 || (t == ix0 && t1 <= ix1)) {
-			s1 = t1 + r;
-			if ((t1&sign) == sign && (s1&sign) == 0)
-				s0++;
-			ix0 -= t;
-			if (ix1 < t1)
-				ix0--;
-			ix1 -= t1;
-			q1 += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
 	}
 
-	/* use floating add to find out rounding direction */
-	if ((ix0|ix1) != 0) {
-		z = 1.0 - tiny; /* raise inexact flag */
-		if (z >= 1.0) {
-			z = 1.0 + tiny;
-			if (q1 == (uint32_t)0xffffffff) {
-				q1 = 0;
-				q++;
-			} else if (z > 1.0) {
-				if (q1 == (uint32_t)0xfffffffe)
-					q++;
-				q1 += 2;
-			} else
-				q1 += q1 & 1;
-		}
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
 	}
-	ix0 = (q>>1) + 0x3fe00000;
-	ix1 = q1>>1;
-	if (q&1)
-		ix1 |= sign;
-	INSERT_WORDS(z, ix0 + ((uint32_t)m << 20), ix1);
-	return z;
+	return y;
 }
diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c
new file mode 100644
index 00000000..61bc22f4
--- /dev/null
+++ b/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h
new file mode 100644
index 00000000..260c7f9c
--- /dev/null
+++ b/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c
index d6ace38a..740d81cb 100644
--- a/src/math/sqrtf.c
+++ b/src/math/sqrtf.c
@@ -1,83 +1,83 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const float tiny = 1.0e-30;
+#define FENV_SUPPORT 1
 
-float sqrtf(float x)
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	float z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix,s,q,m,t,i;
-	uint32_t r;
+	return (uint64_t)a*b >> 32;
+}
 
-	GET_FLOAT_WORD(ix, x);
+/* see sqrt.c for more detailed comments.  */
 
-	/* take care of Inf and NaN */
-	if ((ix&0x7f800000) == 0x7f800000)
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
 
-	/* take care of zero */
-	if (ix <= 0) {
-		if ((ix&~sign) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix>>23;
-	if (m == 0) {  /* subnormal x */
-		for (i = 0; (ix&0x00800000) == 0; i++)
-			ix<<=1;
-		m -= i - 1;
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
 	}
-	m -= 127;  /* unbias exponent */
-	ix = (ix&0x007fffff)|0x00800000;
-	if (m&1)  /* odd m, double x to make it even */
-		ix += ix;
-	m >>= 1;  /* m = [m/2] */
 
-	/* generate sqrt(x) bit by bit */
-	ix += ix;
-	q = s = 0;       /* q = sqrt(x) */
-	r = 0x01000000;  /* r = moving bit from right to left */
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
 
-	while (r != 0) {
-		t = s + r;
-		if (t <= ix) {
-			s = t+r;
-			ix -= t;
-			q += r;
-		}
-		ix += ix;
-		r >>= 1;
-	}
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
 
-	/* use floating add to find out rounding direction */
-	if (ix != 0) {
-		z = 1.0f - tiny; /* raise inexact flag */
-		if (z >= 1.0f) {
-			z = 1.0f + tiny;
-			if (z > 1.0f)
-				q += 2;
-			else
-				q += q & 1;
-		}
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
 	}
-	ix = (q>>1) + 0x3f000000;
-	SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23));
-	return z;
+	return y;
 }
diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c
index 83a8f80c..a231b3f2 100644
--- a/src/math/sqrtl.c
+++ b/src/math/sqrtl.c
@@ -1,7 +1,259 @@
+#include <stdint.h>
 #include <math.h>
+#include <float.h>
+#include "libm.h"
 
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 long double sqrtl(long double x)
 {
-	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */
 	return sqrt(x);
 }
+#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+typedef struct {
+	uint64_t hi;
+	uint64_t lo;
+} u128;
+
+/* top: 16 bit sign+exponent, x: significand.  */
+static inline long double mkldbl(uint64_t top, u128 x)
+{
+	union ldshape u;
+#if LDBL_MANT_DIG == 113
+	u.i2.hi = x.hi;
+	u.i2.lo = x.lo;
+	u.i2.hi &= 0x0000ffffffffffff;
+	u.i2.hi |= top << 48;
+#elif LDBL_MANT_DIG == 64
+	u.i.se = top;
+	u.i.m = x.lo;
+	/* force the top bit on non-zero (and non-subnormal) results.  */
+	if (top & 0x7fff)
+		u.i.m |= 0x8000000000000000;
+#endif
+	return u.f;
+}
+
+/* return: top 16 bit is sign+exp and following bits are the significand.  */
+static inline u128 asu128(long double x)
+{
+	union ldshape u = {.f=x};
+	u128 r;
+#if LDBL_MANT_DIG == 113
+	r.hi = u.i2.hi;
+	r.lo = u.i2.lo;
+#elif LDBL_MANT_DIG == 64
+	r.lo = u.i.m<<49;
+	/* ignore the top bit: pseudo numbers are not handled. */
+	r.hi = u.i.m>>15;
+	r.hi &= 0x0000ffffffffffff;
+	r.hi |= (uint64_t)u.i.se << 48;
+#endif
+	return r;
+}
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+static inline u128 add64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo + b;
+	r.hi = a.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 add128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo + b.lo;
+	r.hi = a.hi + b.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 sub64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo - b;
+	r.hi = a.hi;
+	if (a.lo < b)
+		r.hi--;
+	return r;
+}
+
+static inline u128 sub128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo - b.lo;
+	r.hi = a.hi - b.hi;
+	if (a.lo < b.lo)
+		r.hi--;
+	return r;
+}
+
+/* a<<n, 0 <= n <= 127 */
+static inline u128 lsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.hi = a.lo<<(n-64);
+		a.lo = 0;
+	} else {
+		a.hi = (a.hi<<n) | (a.lo>>(64-n));
+		a.lo = a.lo<<n;
+	}
+	return a;
+}
+
+/* a>>n, 0 <= n <= 127 */
+static inline u128 rsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.lo = a.hi>>(n-64);
+		a.hi = 0;
+	} else {
+		a.lo = (a.lo>>n) | (a.hi<<(64-n));
+		a.hi = a.hi>>n;
+	}
+	return a;
+}
+
+/* returns a*b exactly.  */
+static inline u128 mul64_128(uint64_t a, uint64_t b)
+{
+	u128 r;
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
+	uint64_t lo2 = (alo*blo)&0xffffffff;
+	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
+	r.lo = (lo1<<32) + lo2;
+	return r;
+}
+
+/* returns a*b*2^-128 - e, with error 0 <= e < 7.  */
+static inline u128 mul128(u128 a, u128 b)
+{
+	u128 hi = mul64_128(a.hi, b.hi);
+	uint64_t m1 = mul64(a.hi, b.lo);
+	uint64_t m2 = mul64(a.lo, b.hi);
+	return add64(add64(hi, m1), m2);
+}
+
+/* returns a*b % 2^128.  */
+static inline u128 mul128_tail(u128 a, u128 b)
+{
+	u128 lo = mul64_128(a.lo, b.lo);
+	lo.hi += a.hi*b.lo + a.lo*b.hi;
+	return lo;
+}
+
+
+/* see sqrt.c for detailed comments.  */
+
+long double sqrtl(long double x)
+{
+	u128 ix, ml;
+	uint64_t top;
+
+	ix = asu128(x);
+	top = ix.hi >> 48;
+	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
+		/* x < 0x1p-16382 or inf or nan.  */
+		if (2*ix.hi == 0 && ix.lo == 0)
+			return x;
+		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
+			return x;
+		if (top >= 0x7fff)
+			return __math_invalidl(x);
+		/* x is subnormal, normalize it.  */
+		ix = asu128(x * 0x1p112);
+		top = ix.hi >> 48;
+		top -= 112;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4) */
+	int even = top & 1;
+	ml = lsh(ix, 15);
+	ml.hi |= 0x8000000000000000;
+	if (even) ml = rsh(ml, 1);
+	top = (top + 0x3fff) >> 1;
+
+	/* r ~ 1/sqrt(m) */
+	const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+	i = (ix.hi >> 42) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(ml.hi>>32, r);
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
+	r = r<<32;
+	s = mul64(ml.hi, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
+	s = mul64(u, s) << 1;
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
+
+	const u128 threel = {.hi=three<<32, .lo=0};
+	u128 rl, sl, dl, ul;
+	rl.hi = r;
+	rl.lo = 0;
+	sl = mul128(ml, rl);
+	dl = mul128(sl, rl);
+	ul = sub128(threel, dl);
+	sl = mul128(ul, sl); /* repr: 3.125 */
+	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
+	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1));
+	/* s < sqrt(m) < s + 1 ULP + tiny */
+
+	long double y;
+	u128 d2, d1, d0;
+	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
+	d1 = sub128(sl, d0);
+	d2 = add128(add64(sl, 1), d1);
+	sl = add64(sl, d1.hi >> 63);
+	y = mkldbl(top, sl);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception.  */
+		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
+		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
+		y += mkldbl(top, (u128){0});
+	}
+	return y;
+}
+#else
+#error unsupported long double format
+#endif
diff --git a/src/math/x86_64/fabs.c b/src/math/x86_64/fabs.c
new file mode 100644
index 00000000..16562477
--- /dev/null
+++ b/src/math/x86_64/fabs.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+double fabs(double x)
+{
+	double t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrlq   $1, %0" : "+x"(t));          // t >>= 1
+	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabs.s b/src/math/x86_64/fabs.s
deleted file mode 100644
index 5715005e..00000000
--- a/src/math/x86_64/fabs.s
+++ /dev/null
@@ -1,9 +0,0 @@
-.global fabs
-.type fabs,@function
-fabs:
-	xor %eax,%eax
-	dec %rax
-	shr %rax
-	movq %rax,%xmm1
-	andpd %xmm1,%xmm0
-	ret
diff --git a/src/math/x86_64/fabsf.c b/src/math/x86_64/fabsf.c
new file mode 100644
index 00000000..36ea7481
--- /dev/null
+++ b/src/math/x86_64/fabsf.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+float fabsf(float x)
+{
+	float t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrld   $1, %0" : "+x"(t));          // t >>= 1
+	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabsf.s b/src/math/x86_64/fabsf.s
deleted file mode 100644
index 501a1f17..00000000
--- a/src/math/x86_64/fabsf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global fabsf
-.type fabsf,@function
-fabsf:
-	mov $0x7fffffff,%eax
-	movq %rax,%xmm1
-	andps %xmm1,%xmm0
-	ret
diff --git a/src/math/x86_64/fabsl.c b/src/math/x86_64/fabsl.c
new file mode 100644
index 00000000..cc1c9ed9
--- /dev/null
+++ b/src/math/x86_64/fabsl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double fabsl(long double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/fabsl.s b/src/math/x86_64/fabsl.s
deleted file mode 100644
index 4e7ab525..00000000
--- a/src/math/x86_64/fabsl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsl
-.type fabsl,@function
-fabsl:
-	fldt 8(%rsp)
-	fabs
-	ret
diff --git a/src/math/x86_64/fmodl.c b/src/math/x86_64/fmodl.c
new file mode 100644
index 00000000..3daeab06
--- /dev/null
+++ b/src/math/x86_64/fmodl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double fmodl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/fmodl.s b/src/math/x86_64/fmodl.s
deleted file mode 100644
index ea07b402..00000000
--- a/src/math/x86_64/fmodl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodl
-.type fmodl,@function
-fmodl:
-	fldt 24(%rsp)
-	fldt 8(%rsp)
-1:	fprem
-	fnstsw %ax
-	testb $4,%ah
-	jnz 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/x86_64/llrint.c b/src/math/x86_64/llrint.c
new file mode 100644
index 00000000..dd38a722
--- /dev/null
+++ b/src/math/x86_64/llrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrint(double x)
+{
+	long long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrint.s b/src/math/x86_64/llrint.s
deleted file mode 100644
index bf476498..00000000
--- a/src/math/x86_64/llrint.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global llrint
-.type llrint,@function
-llrint:
-	cvtsd2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/llrintf.c b/src/math/x86_64/llrintf.c
new file mode 100644
index 00000000..fc8625e8
--- /dev/null
+++ b/src/math/x86_64/llrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintf(float x)
+{
+	long long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrintf.s b/src/math/x86_64/llrintf.s
deleted file mode 100644
index d7204ac0..00000000
--- a/src/math/x86_64/llrintf.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global llrintf
-.type llrintf,@function
-llrintf:
-	cvtss2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/llrintl.c b/src/math/x86_64/llrintl.c
new file mode 100644
index 00000000..c439ef28
--- /dev/null
+++ b/src/math/x86_64/llrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintl(long double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/llrintl.s b/src/math/x86_64/llrintl.s
deleted file mode 100644
index 1ec0817d..00000000
--- a/src/math/x86_64/llrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global llrintl
-.type llrintl,@function
-llrintl:
-	fldt 8(%rsp)
-	fistpll 8(%rsp)
-	mov 8(%rsp),%rax
-	ret
diff --git a/src/math/x86_64/lrint.c b/src/math/x86_64/lrint.c
new file mode 100644
index 00000000..a742fec6
--- /dev/null
+++ b/src/math/x86_64/lrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrint(double x)
+{
+	long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrint.s b/src/math/x86_64/lrint.s
deleted file mode 100644
index 15fc2454..00000000
--- a/src/math/x86_64/lrint.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global lrint
-.type lrint,@function
-lrint:
-	cvtsd2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/lrintf.c b/src/math/x86_64/lrintf.c
new file mode 100644
index 00000000..2ba5639d
--- /dev/null
+++ b/src/math/x86_64/lrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintf(float x)
+{
+	long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrintf.s b/src/math/x86_64/lrintf.s
deleted file mode 100644
index 488423d2..00000000
--- a/src/math/x86_64/lrintf.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global lrintf
-.type lrintf,@function
-lrintf:
-	cvtss2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/lrintl.c b/src/math/x86_64/lrintl.c
new file mode 100644
index 00000000..068e2e4d
--- /dev/null
+++ b/src/math/x86_64/lrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintl(long double x)
+{
+	long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/lrintl.s b/src/math/x86_64/lrintl.s
deleted file mode 100644
index d587b12b..00000000
--- a/src/math/x86_64/lrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintl
-.type lrintl,@function
-lrintl:
-	fldt 8(%rsp)
-	fistpll 8(%rsp)
-	mov 8(%rsp),%rax
-	ret
diff --git a/src/math/x86_64/remainderl.c b/src/math/x86_64/remainderl.c
new file mode 100644
index 00000000..8cf75071
--- /dev/null
+++ b/src/math/x86_64/remainderl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double remainderl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/remainderl.s b/src/math/x86_64/remainderl.s
deleted file mode 100644
index cb3857b4..00000000
--- a/src/math/x86_64/remainderl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global remainderl
-.type remainderl,@function
-remainderl:
-	fldt 24(%rsp)
-	fldt 8(%rsp)
-1:	fprem1
-	fnstsw %ax
-	testb $4,%ah
-	jnz 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/x86_64/remquol.c b/src/math/x86_64/remquol.c
new file mode 100644
index 00000000..60eef089
--- /dev/null
+++ b/src/math/x86_64/remquol.c
@@ -0,0 +1,32 @@
+#include <math.h>
+
+long double remquol(long double x, long double y, int *quo)
+{
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	/* By ensuring that addresses of x and y cannot be discarded,
+	 * this empty asm guides GCC into representing extraction of
+	 * their sign bits as memory loads rather than making x and y
+	 * not-address-taken internally and using bitfield operations,
+	 * which in the end wouldn't work out, as extraction from FPU
+	 * registers needs to go through memory anyway. This way GCC
+	 * should manage to use incoming stack slots without spills. */
+	__asm__ ("" :: "X"(cx), "X"(cy));
+
+	long double t = x;
+	unsigned fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	/* C0, C1, C3 flags in x87 status word carry low bits of quotient:
+	 * 15 14 13 12 11 10  9  8
+	 *  . C3  .  .  . C2 C1 C0
+	 *  . b1  .  .  .  0 b0 b2 */
+	unsigned char i = fpsr >> 8;
+	i = i>>4 | i<<4;
+	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via
+	 * in-register table lookup. */
+	unsigned qbits = 0x7575313164642020 >> (i & 60);
+	qbits &= 7;
+
+	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits;
+	return t;
+}
diff --git a/src/math/x86_64/rintl.c b/src/math/x86_64/rintl.c
new file mode 100644
index 00000000..e1a92077
--- /dev/null
+++ b/src/math/x86_64/rintl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double rintl(long double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/rintl.s b/src/math/x86_64/rintl.s
deleted file mode 100644
index 64e663cd..00000000
--- a/src/math/x86_64/rintl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintl
-.type rintl,@function
-rintl:
-	fldt 8(%rsp)
-	frndint
-	ret
diff --git a/src/math/x86_64/sqrt.c b/src/math/x86_64/sqrt.c
new file mode 100644
index 00000000..657e09e3
--- /dev/null
+++ b/src/math/x86_64/sqrt.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double sqrt(double x)
+{
+	__asm__ ("sqrtsd %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrt.s b/src/math/x86_64/sqrt.s
deleted file mode 100644
index d3c609f9..00000000
--- a/src/math/x86_64/sqrt.s
+++ /dev/null
@@ -1,4 +0,0 @@
-.global sqrt
-.type sqrt,@function
-sqrt:	sqrtsd %xmm0, %xmm0
-	ret
diff --git a/src/math/x86_64/sqrtf.c b/src/math/x86_64/sqrtf.c
new file mode 100644
index 00000000..720baec6
--- /dev/null
+++ b/src/math/x86_64/sqrtf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float sqrtf(float x)
+{
+	__asm__ ("sqrtss %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtf.s b/src/math/x86_64/sqrtf.s
deleted file mode 100644
index eec48c60..00000000
--- a/src/math/x86_64/sqrtf.s
+++ /dev/null
@@ -1,4 +0,0 @@
-.global sqrtf
-.type sqrtf,@function
-sqrtf:  sqrtss %xmm0, %xmm0
-	ret
diff --git a/src/math/x86_64/sqrtl.c b/src/math/x86_64/sqrtl.c
new file mode 100644
index 00000000..864cfcc4
--- /dev/null
+++ b/src/math/x86_64/sqrtl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtl.s b/src/math/x86_64/sqrtl.s
deleted file mode 100644
index 23cd687d..00000000
--- a/src/math/x86_64/sqrtl.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global sqrtl
-.type sqrtl,@function
-sqrtl:	fldt 8(%rsp)
-	fsqrt
-	ret
diff --git a/src/misc/getentropy.c b/src/misc/getentropy.c
index d2f282ce..651ea95f 100644
--- a/src/misc/getentropy.c
+++ b/src/misc/getentropy.c
@@ -6,7 +6,7 @@
 
 int getentropy(void *buffer, size_t len)
 {
-	int cs, ret;
+	int cs, ret = 0;
 	char *pos = buffer;
 
 	if (len > 256) {
diff --git a/src/misc/getopt.c b/src/misc/getopt.c
index c3f66995..b02b81c3 100644
--- a/src/misc/getopt.c
+++ b/src/misc/getopt.c
@@ -87,7 +87,8 @@ int getopt(int argc, char * const argv[], const char *optstring)
 	if (optstring[i] == ':') {
 		optarg = 0;
 		if (optstring[i+1] != ':' || optpos) {
-			optarg = argv[optind++] + optpos;
+			optarg = argv[optind++];
+			if (optpos) optarg += optpos;
 			optpos = 0;
 		}
 		if (optind > argc) {
diff --git a/src/misc/getrlimit.c b/src/misc/getrlimit.c
index 2ab2f0f4..a5558d81 100644
--- a/src/misc/getrlimit.c
+++ b/src/misc/getrlimit.c
@@ -6,12 +6,13 @@
 
 int getrlimit(int resource, struct rlimit *rlim)
 {
-	unsigned long k_rlim[2];
 	int ret = syscall(SYS_prlimit64, 0, resource, 0, rlim);
 	if (!ret) {
 		FIX(rlim->rlim_cur);
 		FIX(rlim->rlim_max);
 	}
+#ifdef SYS_getrlimit
+	unsigned long k_rlim[2];
 	if (!ret || errno != ENOSYS)
 		return ret;
 	if (syscall(SYS_getrlimit, resource, k_rlim) < 0)
@@ -21,6 +22,7 @@ int getrlimit(int resource, struct rlimit *rlim)
 	FIX(rlim->rlim_cur);
 	FIX(rlim->rlim_max);
 	return 0;
+#else
+	return ret;
+#endif
 }
-
-weak_alias(getrlimit, getrlimit64);
diff --git a/src/misc/initgroups.c b/src/misc/initgroups.c
index 922a9581..101f5c7b 100644
--- a/src/misc/initgroups.c
+++ b/src/misc/initgroups.c
@@ -1,11 +1,29 @@
 #define _GNU_SOURCE
 #include <grp.h>
 #include <limits.h>
+#include <stdlib.h>
 
 int initgroups(const char *user, gid_t gid)
 {
-	gid_t groups[NGROUPS_MAX];
-	int count = NGROUPS_MAX;
-	if (getgrouplist(user, gid, groups, &count) < 0) return -1;
-	return setgroups(count, groups);
+	gid_t buf[32], *groups = buf;
+	int count = sizeof buf / sizeof *buf, prev_count = count;
+	while (getgrouplist(user, gid, groups, &count) < 0) {
+		if (groups != buf) free(groups);
+
+		/* Return if failure isn't buffer size */
+		if (count <= prev_count)
+			return -1;
+
+		/* Always increase by at least 50% to limit to
+		 * logarithmically many retries on TOCTOU races. */
+		if (count < prev_count + (prev_count>>1))
+			count = prev_count + (prev_count>>1);
+
+		groups = calloc(count, sizeof *groups);
+		if (!groups) return -1;
+		prev_count = count;
+	}
+	int ret = setgroups(count, groups);
+	if (groups != buf) free(groups);
+	return ret;
 }
diff --git a/src/misc/ioctl.c b/src/misc/ioctl.c
index 89477511..35804f02 100644
--- a/src/misc/ioctl.c
+++ b/src/misc/ioctl.c
@@ -4,7 +4,9 @@
 #include <time.h>
 #include <sys/time.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <string.h>
+#include <endian.h>
 #include "syscall.h"
 
 #define alignof(t) offsetof(struct { char c; t x; }, x)
@@ -28,6 +30,12 @@ struct ioctl_compat_map {
  * number producing macros; only size of result is meaningful. */
 #define new_misaligned(n) struct { int i; time_t t; char c[(n)-4]; }
 
+struct v4l2_event {
+	uint32_t a;
+	uint64_t b[8];
+	uint32_t c[2], ts[2], d[9];
+};
+
 static const struct ioctl_compat_map compat_map[] = {
 	{ SIOCGSTAMP, SIOCGSTAMP_OLD, 8, R, 0, OFFS(0, 4) },
 	{ SIOCGSTAMPNS, SIOCGSTAMPNS_OLD, 8, R, 0, OFFS(0, 4) },
@@ -46,16 +54,17 @@ static const struct ioctl_compat_map compat_map[] = {
 	{ _IOWR('A', 0x23, char[136]), _IOWR('A', 0x23, char[132]), 0, WR, 1, 0 },
 	{ 0, 0, 4, WR, 1, 0 }, /* snd_pcm_sync_ptr (flags only) */
 	{ 0, 0, 32, WR, 1, OFFS(8,12,16,24,28) }, /* snd_pcm_mmap_status */
-	{ 0, 0, 8, WR, 1, OFFS(0,4) }, /* snd_pcm_mmap_control */
+	{ 0, 0, 4, WR, 1, 0 }, /* snd_pcm_mmap_control (each member) */
 
 	/* VIDIOC_QUERYBUF, VIDIOC_QBUF, VIDIOC_DQBUF, VIDIOC_PREPARE_BUF */
-	{ _IOWR('V',  9, new_misaligned(72)), _IOWR('V',  9, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 15, new_misaligned(72)), _IOWR('V', 15, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 17, new_misaligned(72)), _IOWR('V', 17, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 93, new_misaligned(72)), _IOWR('V', 93, char[72]), 72, WR, 0, OFFS(20) },
+	{ _IOWR('V',  9, new_misaligned(68)), _IOWR('V',  9, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 15, new_misaligned(68)), _IOWR('V', 15, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 17, new_misaligned(68)), _IOWR('V', 17, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 93, new_misaligned(68)), _IOWR('V', 93, char[68]), 68, WR, 1, OFFS(20, 24) },
 
 	/* VIDIOC_DQEVENT */
-	{ _IOR('V', 89, new_misaligned(96)), _IOR('V', 89, char[96]), 96, R, 0, OFFS(76,80) },
+	{ _IOR('V', 89, new_misaligned(120)), _IOR('V', 89, struct v4l2_event), sizeof(struct v4l2_event),
+	  R, 0, OFFS(offsetof(struct v4l2_event, ts[0]), offsetof(struct v4l2_event, ts[1])) },
 
 	/* VIDIOC_OMAP3ISP_STAT_REQ */
 	{ _IOWR('V', 192+6, char[32]), _IOWR('V', 192+6, char[24]), 22, WR, 0, OFFS(0,4) },
@@ -82,7 +91,11 @@ static void convert_ioctl_struct(const struct ioctl_compat_map *map, char *old,
 		 * if another exception appears this needs changing. */
 		convert_ioctl_struct(map+1, old, new, dir);
 		convert_ioctl_struct(map+2, old+4, new+8, dir);
-		convert_ioctl_struct(map+3, old+68, new+72, dir);
+		/* snd_pcm_mmap_control, special-cased due to kernel
+		 * type definition having been botched. */
+		int adj = BYTE_ORDER==BIG_ENDIAN ? 4 : 0;
+		convert_ioctl_struct(map+3, old+68, new+72+adj, dir);
+		convert_ioctl_struct(map+3, old+72, new+76+3*adj, dir);
 		return;
 	}
 	for (int i=0; i < map->noffs; i++) {
diff --git a/src/misc/lockf.c b/src/misc/lockf.c
index 16a80bec..0162442b 100644
--- a/src/misc/lockf.c
+++ b/src/misc/lockf.c
@@ -28,5 +28,3 @@ int lockf(int fd, int op, off_t size)
 	errno = EINVAL;
 	return -1;
 }
-
-weak_alias(lockf, lockf64);
diff --git a/src/misc/mntent.c b/src/misc/mntent.c
index eabb8200..76f9c162 100644
--- a/src/misc/mntent.c
+++ b/src/misc/mntent.c
@@ -2,6 +2,7 @@
 #include <string.h>
 #include <mntent.h>
 #include <errno.h>
+#include <limits.h>
 
 static char *internal_buf;
 static size_t internal_bufsize;
@@ -19,9 +20,46 @@ int endmntent(FILE *f)
 	return 1;
 }
 
+static char *unescape_ent(char *beg)
+{
+	char *dest = beg;
+	const char *src = beg;
+	while (*src) {
+		const char *val;
+		unsigned char cval = 0;
+		if (*src != '\\') {
+			*dest++ = *src++;
+			continue;
+		}
+		if (src[1] == '\\') {
+			++src;
+			*dest++ = *src++;
+			continue;
+		}
+		val = src + 1;
+		for (int i = 0; i < 3; ++i) {
+			if (*val >= '0' && *val <= '7') {
+				cval <<= 3;
+				cval += *val++ - '0';
+			} else {
+				break;
+			}
+		}
+		if (cval) {
+			*dest++ = cval;
+			src = val;
+		} else {
+			*dest++ = *src++;
+		}
+	}
+	*dest = 0;
+	return beg;
+}
+
 struct mntent *getmntent_r(FILE *f, struct mntent *mnt, char *linebuf, int buflen)
 {
-	int cnt, n[8], use_internal = (linebuf == SENTINEL);
+	int n[8], use_internal = (linebuf == SENTINEL);
+	size_t len, i;
 
 	mnt->mnt_freq = 0;
 	mnt->mnt_passno = 0;
@@ -39,20 +77,24 @@ struct mntent *getmntent_r(FILE *f, struct mntent *mnt, char *linebuf, int bufle
 			errno = ERANGE;
 			return 0;
 		}
-		cnt = sscanf(linebuf, " %n%*s%n %n%*s%n %n%*s%n %n%*s%n %d %d",
+
+		len = strlen(linebuf);
+		if (len > INT_MAX) continue;
+		for (i = 0; i < sizeof n / sizeof *n; i++) n[i] = len;
+		sscanf(linebuf, " %n%*[^ \t\n]%n %n%*[^ \t\n]%n %n%*[^ \t\n]%n %n%*[^ \t\n]%n %d %d",
 			n, n+1, n+2, n+3, n+4, n+5, n+6, n+7,
 			&mnt->mnt_freq, &mnt->mnt_passno);
-	} while (cnt < 2 || linebuf[n[0]] == '#');
+	} while (linebuf[n[0]] == '#' || n[1]==len);
 
 	linebuf[n[1]] = 0;
 	linebuf[n[3]] = 0;
 	linebuf[n[5]] = 0;
 	linebuf[n[7]] = 0;
 
-	mnt->mnt_fsname = linebuf+n[0];
-	mnt->mnt_dir = linebuf+n[2];
-	mnt->mnt_type = linebuf+n[4];
-	mnt->mnt_opts = linebuf+n[6];
+	mnt->mnt_fsname = unescape_ent(linebuf+n[0]);
+	mnt->mnt_dir = unescape_ent(linebuf+n[2]);
+	mnt->mnt_type = unescape_ent(linebuf+n[4]);
+	mnt->mnt_opts = unescape_ent(linebuf+n[6]);
 
 	return mnt;
 }
@@ -73,5 +115,13 @@ int addmntent(FILE *f, const struct mntent *mnt)
 
 char *hasmntopt(const struct mntent *mnt, const char *opt)
 {
-	return strstr(mnt->mnt_opts, opt);
+	size_t l = strlen(opt);
+	char *p = mnt->mnt_opts;
+	for (;;) {
+		if (!strncmp(p, opt, l) && (!p[l] || p[l]==',' || p[l]=='='))
+			return p;
+		p = strchr(p, ',');
+		if (!p) return 0;
+		p++;
+	}
 }
diff --git a/src/misc/nftw.c b/src/misc/nftw.c
index 0a464100..71bc62ee 100644
--- a/src/misc/nftw.c
+++ b/src/misc/nftw.c
@@ -1,5 +1,6 @@
 #include <ftw.h>
 #include <dirent.h>
+#include <fcntl.h>
 #include <sys/stat.h>
 #include <errno.h>
 #include <unistd.h>
@@ -26,16 +27,19 @@ static int do_nftw(char *path, int (*fn)(const char *, const struct stat *, int,
 	struct history new;
 	int type;
 	int r;
+	int dfd;
+	int err;
 	struct FTW lev;
 
+	st.st_dev = st.st_ino = 0;
+
 	if ((flags & FTW_PHYS) ? lstat(path, &st) : stat(path, &st) < 0) {
 		if (!(flags & FTW_PHYS) && errno==ENOENT && !lstat(path, &st))
 			type = FTW_SLN;
 		else if (errno != EACCES) return -1;
 		else type = FTW_NS;
 	} else if (S_ISDIR(st.st_mode)) {
-		if (access(path, R_OK) < 0) type = FTW_DNR;
-		else if (flags & FTW_DEPTH) type = FTW_DP;
+		if (flags & FTW_DEPTH) type = FTW_DP;
 		else type = FTW_D;
 	} else if (S_ISLNK(st.st_mode)) {
 		if (flags & FTW_PHYS) type = FTW_SL;
@@ -44,7 +48,7 @@ static int do_nftw(char *path, int (*fn)(const char *, const struct stat *, int,
 		type = FTW_F;
 	}
 
-	if ((flags & FTW_MOUNT) && h && st.st_dev != h->dev)
+	if ((flags & FTW_MOUNT) && h && type != FTW_NS && st.st_dev != h->dev)
 		return 0;
 	
 	new.chain = h;
@@ -63,6 +67,13 @@ static int do_nftw(char *path, int (*fn)(const char *, const struct stat *, int,
 		lev.base = k;
 	}
 
+	if (type == FTW_D || type == FTW_DP) {
+		dfd = open(path, O_RDONLY);
+		err = errno;
+		if (dfd < 0 && err == EACCES) type = FTW_DNR;
+		if (!fd_limit) close(dfd);
+	}
+
 	if (!(flags & FTW_DEPTH) && (r=fn(path, &st, type, &lev)))
 		return r;
 
@@ -71,7 +82,11 @@ static int do_nftw(char *path, int (*fn)(const char *, const struct stat *, int,
 			return 0;
 
 	if ((type == FTW_D || type == FTW_DP) && fd_limit) {
-		DIR *d = opendir(path);
+		if (dfd < 0) {
+			errno = err;
+			return -1;
+		}
+		DIR *d = fdopendir(dfd);
 		if (d) {
 			struct dirent *de;
 			while ((de = readdir(d))) {
@@ -92,7 +107,8 @@ static int do_nftw(char *path, int (*fn)(const char *, const struct stat *, int,
 				}
 			}
 			closedir(d);
-		} else if (errno != EACCES) {
+		} else {
+			close(dfd);
 			return -1;
 		}
 	}
@@ -124,5 +140,3 @@ int nftw(const char *path, int (*fn)(const char *, const struct stat *, int, str
 	pthread_setcancelstate(cs, 0);
 	return r;
 }
-
-weak_alias(nftw, nftw64);
diff --git a/src/misc/realpath.c b/src/misc/realpath.c
index d2708e59..db8b74dc 100644
--- a/src/misc/realpath.c
+++ b/src/misc/realpath.c
@@ -1,43 +1,156 @@
 #include <stdlib.h>
 #include <limits.h>
-#include <sys/stat.h>
-#include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 #include <string.h>
-#include "syscall.h"
+
+static size_t slash_len(const char *s)
+{
+	const char *s0 = s;
+	while (*s == '/') s++;
+	return s-s0;
+}
 
 char *realpath(const char *restrict filename, char *restrict resolved)
 {
-	int fd;
-	ssize_t r;
-	struct stat st1, st2;
-	char buf[15+3*sizeof(int)];
-	char tmp[PATH_MAX];
+	char stack[PATH_MAX+1];
+	char output[PATH_MAX];
+	size_t p, q, l, l0, cnt=0, nup=0;
+	int check_dir=0;
 
 	if (!filename) {
 		errno = EINVAL;
 		return 0;
 	}
+	l = strnlen(filename, sizeof stack);
+	if (!l) {
+		errno = ENOENT;
+		return 0;
+	}
+	if (l >= PATH_MAX) goto toolong;
+	p = sizeof stack - l - 1;
+	q = 0;
+	memcpy(stack+p, filename, l+1);
+
+	/* Main loop. Each iteration pops the next part from stack of
+	 * remaining path components and consumes any slashes that follow.
+	 * If not a link, it's moved to output; if a link, contents are
+	 * pushed to the stack. */
+restart:
+	for (; ; p+=slash_len(stack+p)) {
+		/* If stack starts with /, the whole component is / or //
+		 * and the output state must be reset. */
+		if (stack[p] == '/') {
+			check_dir=0;
+			nup=0;
+			q=0;
+			output[q++] = '/';
+			p++;
+			/* Initial // is special. */
+			if (stack[p] == '/' && stack[p+1] != '/')
+				output[q++] = '/';
+			continue;
+		}
+
+		char *z = __strchrnul(stack+p, '/');
+		l0 = l = z-(stack+p);
 
-	fd = sys_open(filename, O_PATH|O_NONBLOCK|O_CLOEXEC);
-	if (fd < 0) return 0;
-	__procfdname(buf, fd);
+		if (!l && !check_dir) break;
 
-	r = readlink(buf, tmp, sizeof tmp - 1);
-	if (r < 0) goto err;
-	tmp[r] = 0;
+		/* Skip any . component but preserve check_dir status. */
+		if (l==1 && stack[p]=='.') {
+			p += l;
+			continue;
+		}
 
-	fstat(fd, &st1);
-	r = stat(tmp, &st2);
-	if (r<0 || st1.st_dev != st2.st_dev || st1.st_ino != st2.st_ino) {
-		if (!r) errno = ELOOP;
-		goto err;
+		/* Copy next component onto output at least temporarily, to
+		 * call readlink, but wait to advance output position until
+		 * determining it's not a link. */
+		if (q && output[q-1] != '/') {
+			if (!p) goto toolong;
+			stack[--p] = '/';
+			l++;
+		}
+		if (q+l >= PATH_MAX) goto toolong;
+		memcpy(output+q, stack+p, l);
+		output[q+l] = 0;
+		p += l;
+
+		int up = 0;
+		if (l0==2 && stack[p-2]=='.' && stack[p-1]=='.') {
+			up = 1;
+			/* Any non-.. path components we could cancel start
+			 * after nup repetitions of the 3-byte string "../";
+			 * if there are none, accumulate .. components to
+			 * later apply to cwd, if needed. */
+			if (q <= 3*nup) {
+				nup++;
+				q += l;
+				continue;
+			}
+			/* When previous components are already known to be
+			 * directories, processing .. can skip readlink. */
+			if (!check_dir) goto skip_readlink;
+		}
+		ssize_t k = readlink(output, stack, p);
+		if (k==p) goto toolong;
+		if (!k) {
+			errno = ENOENT;
+			return 0;
+		}
+		if (k<0) {
+			if (errno != EINVAL) return 0;
+skip_readlink:
+			check_dir = 0;
+			if (up) {
+				while(q && output[q-1]!='/') q--;
+				if (q>1 && (q>2 || output[0]!='/')) q--;
+				continue;
+			}
+			if (l0) q += l;
+			check_dir = stack[p];
+			continue;
+		}
+		if (++cnt == SYMLOOP_MAX) {
+			errno = ELOOP;
+			return 0;
+		}
+
+		/* If link contents end in /, strip any slashes already on
+		 * stack to avoid /->// or //->/// or spurious toolong. */
+		if (stack[k-1]=='/') while (stack[p]=='/') p++;
+		p -= k;
+		memmove(stack+p, stack, k);
+
+		/* Skip the stack advancement in case we have a new
+		 * absolute base path. */
+		goto restart;
 	}
 
-	__syscall(SYS_close, fd);
-	return resolved ? strcpy(resolved, tmp) : strdup(tmp);
-err:
-	__syscall(SYS_close, fd);
+ 	output[q] = 0;
+
+	if (output[0] != '/') {
+		if (!getcwd(stack, sizeof stack)) return 0;
+		l = strlen(stack);
+		/* Cancel any initial .. components. */
+		p = 0;
+		while (nup--) {
+			while(l>1 && stack[l-1]!='/') l--;
+			if (l>1) l--;
+			p += 2;
+			if (p<q) p++;
+		}
+		if (q-p && stack[l-1]!='/') stack[l++] = '/';
+		if (l + (q-p) + 1 >= PATH_MAX) goto toolong;
+		memmove(output + l, output + p, q - p + 1);
+		memcpy(output, stack, l);
+		q = l + q-p;
+	}
+
+	if (resolved) return memcpy(resolved, output, q+1);
+	else return strdup(output);
+
+toolong:
+	errno = ENAMETOOLONG;
 	return 0;
 }
diff --git a/src/misc/setrlimit.c b/src/misc/setrlimit.c
index 7a66ab29..edb413fa 100644
--- a/src/misc/setrlimit.c
+++ b/src/misc/setrlimit.c
@@ -6,45 +6,46 @@
 #define MIN(a, b) ((a)<(b) ? (a) : (b))
 #define FIX(x) do{ if ((x)>=SYSCALL_RLIM_INFINITY) (x)=RLIM_INFINITY; }while(0)
 
-static int __setrlimit(int resource, const struct rlimit *rlim)
-{
-	unsigned long k_rlim[2];
-	struct rlimit tmp;
-	if (SYSCALL_RLIM_INFINITY != RLIM_INFINITY) {
-		tmp = *rlim;
-		FIX(tmp.rlim_cur);
-		FIX(tmp.rlim_max);
-		rlim = &tmp;
-	}
-	int ret = __syscall(SYS_prlimit64, 0, resource, rlim, 0);
-	if (ret != -ENOSYS) return ret;
-	k_rlim[0] = MIN(rlim->rlim_cur, MIN(-1UL, SYSCALL_RLIM_INFINITY));
-	k_rlim[1] = MIN(rlim->rlim_max, MIN(-1UL, SYSCALL_RLIM_INFINITY));
-	return __syscall(SYS_setrlimit, resource, k_rlim);
-}
-
 struct ctx {
-	const struct rlimit *rlim;
+	unsigned long lim[2];
 	int res;
 	int err;
 };
 
+#ifdef SYS_setrlimit
 static void do_setrlimit(void *p)
 {
 	struct ctx *c = p;
 	if (c->err>0) return;
-	c->err = -__setrlimit(c->res, c->rlim);
+	c->err = -__syscall(SYS_setrlimit, c->res, c->lim);
 }
+#endif
 
 int setrlimit(int resource, const struct rlimit *rlim)
 {
-	struct ctx c = { .res = resource, .rlim = rlim, .err = -1 };
+	struct rlimit tmp;
+	if (SYSCALL_RLIM_INFINITY != RLIM_INFINITY) {
+		tmp = *rlim;
+		FIX(tmp.rlim_cur);
+		FIX(tmp.rlim_max);
+		rlim = &tmp;
+	}
+	int ret = __syscall(SYS_prlimit64, 0, resource, rlim, 0);
+#ifdef SYS_setrlimit
+	if (ret != -ENOSYS) return __syscall_ret(ret);
+
+	struct ctx c = {
+		.lim[0] = MIN(rlim->rlim_cur, MIN(-1UL, SYSCALL_RLIM_INFINITY)),
+		.lim[1] = MIN(rlim->rlim_max, MIN(-1UL, SYSCALL_RLIM_INFINITY)),
+		.res = resource, .err = -1
+	};
 	__synccall(do_setrlimit, &c);
 	if (c.err) {
 		if (c.err>0) errno = c.err;
 		return -1;
 	}
 	return 0;
+#else
+	return __syscall_ret(ret);
+#endif
 }
-
-weak_alias(setrlimit, setrlimit64);
diff --git a/src/misc/syslog.c b/src/misc/syslog.c
index 13d4b0a6..710202f9 100644
--- a/src/misc/syslog.c
+++ b/src/misc/syslog.c
@@ -10,6 +10,8 @@
 #include <errno.h>
 #include <fcntl.h>
 #include "lock.h"
+#include "fork_impl.h"
+#include "locale_impl.h"
 
 static volatile int lock[1];
 static char log_ident[32];
@@ -17,6 +19,7 @@ static int log_opt;
 static int log_facility = LOG_USER;
 static int log_mask = 0xff;
 static int log_fd = -1;
+volatile int *const __syslog_lockptr = lock;
 
 int setlogmask(int maskpri)
 {
@@ -97,7 +100,7 @@ static void _vsyslog(int priority, const char *message, va_list ap)
 
 	now = time(NULL);
 	gmtime_r(&now, &tm);
-	strftime(timebuf, sizeof timebuf, "%b %e %T", &tm);
+	strftime_l(timebuf, sizeof timebuf, "%b %e %T", &tm, C_LOCALE);
 
 	pid = (log_opt & LOG_PID) ? getpid() : 0;
 	l = snprintf(buf, sizeof buf, "<%d>%s %n%s%s%.0d%s: ",
diff --git a/src/mman/mmap.c b/src/mman/mmap.c
index eff88d82..43e5e029 100644
--- a/src/mman/mmap.c
+++ b/src/mman/mmap.c
@@ -37,5 +37,3 @@ void *__mmap(void *start, size_t len, int prot, int flags, int fd, off_t off)
 }
 
 weak_alias(__mmap, mmap);
-
-weak_alias(mmap, mmap64);
diff --git a/src/mq/mq_notify.c b/src/mq/mq_notify.c
index 221591c7..0e1e6c7a 100644
--- a/src/mq/mq_notify.c
+++ b/src/mq/mq_notify.c
@@ -4,11 +4,14 @@
 #include <sys/socket.h>
 #include <signal.h>
 #include <unistd.h>
+#include <semaphore.h>
 #include "syscall.h"
 
 struct args {
-	pthread_barrier_t barrier;
+	sem_t sem;
 	int sock;
+	mqd_t mqd;
+	int err;
 	const struct sigevent *sev;
 };
 
@@ -20,8 +23,19 @@ static void *start(void *p)
 	int s = args->sock;
 	void (*func)(union sigval) = args->sev->sigev_notify_function;
 	union sigval val = args->sev->sigev_value;
+	struct sigevent sev2;
+	static const char zeros[32];
+	int err;
+
+	sev2.sigev_notify = SIGEV_THREAD;
+	sev2.sigev_signo = s;
+	sev2.sigev_value.sival_ptr = (void *)&zeros;
+
+	args->err = err = -__syscall(SYS_mq_notify, args->mqd, &sev2);
+	sem_post(&args->sem);
+	if (err) return 0;
 
-	pthread_barrier_wait(&args->barrier);
+	pthread_detach(pthread_self());
 	n = recv(s, buf, sizeof(buf), MSG_NOSIGNAL|MSG_WAITALL);
 	close(s);
 	if (n==sizeof buf && buf[sizeof buf - 1] == 1)
@@ -35,8 +49,8 @@ int mq_notify(mqd_t mqd, const struct sigevent *sev)
 	pthread_attr_t attr;
 	pthread_t td;
 	int s;
-	struct sigevent sev2;
-	static const char zeros[32];
+	int cs;
+	sigset_t allmask, origmask;
 
 	if (!sev || sev->sigev_notify != SIGEV_THREAD)
 		return syscall(SYS_mq_notify, mqd, sev);
@@ -44,30 +58,35 @@ int mq_notify(mqd_t mqd, const struct sigevent *sev)
 	s = socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, 0);
 	if (s < 0) return -1;
 	args.sock = s;
+	args.mqd = mqd;
 
 	if (sev->sigev_notify_attributes) attr = *sev->sigev_notify_attributes;
 	else pthread_attr_init(&attr);
-	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-	pthread_barrier_init(&args.barrier, 0, 2);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+	sem_init(&args.sem, 0, 0);
 
+	sigfillset(&allmask);
+	pthread_sigmask(SIG_BLOCK, &allmask, &origmask);
 	if (pthread_create(&td, &attr, start, &args)) {
 		__syscall(SYS_close, s);
+		pthread_sigmask(SIG_SETMASK, &origmask, 0);
 		errno = EAGAIN;
 		return -1;
 	}
+	pthread_sigmask(SIG_SETMASK, &origmask, 0);
 
-	pthread_barrier_wait(&args.barrier);
-	pthread_barrier_destroy(&args.barrier);
-
-	sev2.sigev_notify = SIGEV_THREAD;
-	sev2.sigev_signo = s;
-	sev2.sigev_value.sival_ptr = (void *)&zeros;
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+	sem_wait(&args.sem);
+	sem_destroy(&args.sem);
 
-	if (syscall(SYS_mq_notify, mqd, &sev2) < 0) {
-		pthread_cancel(td);
+	if (args.err) {
 		__syscall(SYS_close, s);
+		pthread_join(td, 0);
+		pthread_setcancelstate(cs, 0);
+		errno = args.err;
 		return -1;
 	}
 
+	pthread_setcancelstate(cs, 0);
 	return 0;
 }
diff --git a/src/mq/x32/mq_open.c b/src/mq/x32/mq_open.c
new file mode 100644
index 00000000..23481959
--- /dev/null
+++ b/src/mq/x32/mq_open.c
@@ -0,0 +1,22 @@
+#include <mqueue.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include "syscall.h"
+
+mqd_t mq_open(const char *name, int flags, ...)
+{
+	mode_t mode = 0;
+	struct mq_attr *attr = 0;
+	long long attrbuf[8];
+	if (*name == '/') name++;
+	if (flags & O_CREAT) {
+		va_list ap;
+		va_start(ap, flags);
+		mode = va_arg(ap, mode_t);
+		attr = va_arg(ap, struct mq_attr *);
+		if (attr) for (int i=0; i<8; i++)
+			attrbuf[i] = *(long *)((char *)attr + i*sizeof(long));
+		va_end(ap);
+	}
+	return syscall(SYS_mq_open, name, flags, mode, attr?attrbuf:0);
+}
diff --git a/src/mq/x32/mq_setattr.c b/src/mq/x32/mq_setattr.c
new file mode 100644
index 00000000..0c631175
--- /dev/null
+++ b/src/mq/x32/mq_setattr.c
@@ -0,0 +1,14 @@
+#include <mqueue.h>
+#include "syscall.h"
+
+int mq_setattr(mqd_t mqd, const struct mq_attr *restrict new, struct mq_attr *restrict old)
+{
+	long long attr[8];
+	if (new) for (int i=0; i<8; i++)
+		attr[i] = *(long *)((char *)new + i*sizeof(long));
+	int ret = __syscall(SYS_mq_getsetattr, mqd, new?attr:0, old?attr:0);
+	if (ret < 0) return __syscall_ret(ret);
+	if (old) for (int i=0; i<8; i++)
+		*(long *)((char *)old + i*sizeof(long)) = attr[i];
+	return 0;
+}
diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c
index c94819e7..7824997e 100644
--- a/src/multibyte/mbrtowc.c
+++ b/src/multibyte/mbrtowc.c
@@ -8,7 +8,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
 	static unsigned internal_state;
 	unsigned c;
 	const unsigned char *s = (const void *)src;
-	const unsigned N = n;
+	const size_t N = n;
 	wchar_t dummy;
 
 	if (!st) st = (void *)&internal_state;
diff --git a/src/multibyte/mbsnrtowcs.c b/src/multibyte/mbsnrtowcs.c
index 931192e2..47cbdc00 100644
--- a/src/multibyte/mbsnrtowcs.c
+++ b/src/multibyte/mbsnrtowcs.c
@@ -2,11 +2,13 @@
 
 size_t mbsnrtowcs(wchar_t *restrict wcs, const char **restrict src, size_t n, size_t wn, mbstate_t *restrict st)
 {
+	static unsigned internal_state;
 	size_t l, cnt=0, n2;
 	wchar_t *ws, wbuf[256];
 	const char *s = *src;
 	const char *tmp_s;
 
+	if (!st) st = (void *)&internal_state;
 	if (!wcs) ws = wbuf, wn = sizeof wbuf / sizeof *wbuf;
 	else ws = wcs;
 
@@ -41,8 +43,8 @@ size_t mbsnrtowcs(wchar_t *restrict wcs, const char **restrict src, size_t n, si
 				s = 0;
 				break;
 			}
-			/* have to roll back partial character */
-			*(unsigned *)st = 0;
+			s += n;
+			n -= n;
 			break;
 		}
 		s += l; n -= l;
diff --git a/src/multibyte/wcsnrtombs.c b/src/multibyte/wcsnrtombs.c
index 676932b5..95e25e70 100644
--- a/src/multibyte/wcsnrtombs.c
+++ b/src/multibyte/wcsnrtombs.c
@@ -1,41 +1,33 @@
 #include <wchar.h>
+#include <limits.h>
+#include <string.h>
 
 size_t wcsnrtombs(char *restrict dst, const wchar_t **restrict wcs, size_t wn, size_t n, mbstate_t *restrict st)
 {
-	size_t l, cnt=0, n2;
-	char *s, buf[256];
 	const wchar_t *ws = *wcs;
-	const wchar_t *tmp_ws;
-
-	if (!dst) s = buf, n = sizeof buf;
-	else s = dst;
-
-	while ( ws && n && ( (n2=wn)>=n || n2>32 ) ) {
-		if (n2>=n) n2=n;
-		tmp_ws = ws;
-		l = wcsrtombs(s, &ws, n2, 0);
-		if (!(l+1)) {
-			cnt = l;
-			n = 0;
+	size_t cnt = 0;
+	if (!dst) n=0;
+	while (ws && wn) {
+		char tmp[MB_LEN_MAX];
+		size_t l = wcrtomb(n<MB_LEN_MAX ? tmp : dst, *ws, 0);
+		if (l==-1) {
+			cnt = -1;
 			break;
 		}
-		if (s != buf) {
-			s += l;
+		if (dst) {
+			if (n<MB_LEN_MAX) {
+				if (l>n) break;
+				memcpy(dst, tmp, l);
+			}
+			dst += l;
 			n -= l;
 		}
-		wn = ws ? wn - (ws - tmp_ws) : 0;
-		cnt += l;
-	}
-	if (ws) while (n && wn) {
-		l = wcrtomb(s, *ws, 0);
-		if ((l+1)<=1) {
-			if (!l) ws = 0;
-			else cnt = l;
+		if (!*ws) {
+			ws = 0;
 			break;
 		}
-		ws++; wn--;
-		/* safe - this loop runs fewer than sizeof(buf) times */
-		s+=l; n-=l;
+		ws++;
+		wn--;
 		cnt += l;
 	}
 	if (dst) *wcs = ws;
diff --git a/src/network/accept4.c b/src/network/accept4.c
index 59ab1726..765a38ed 100644
--- a/src/network/accept4.c
+++ b/src/network/accept4.c
@@ -9,6 +9,10 @@ int accept4(int fd, struct sockaddr *restrict addr, socklen_t *restrict len, int
 	if (!flg) return accept(fd, addr, len);
 	int ret = socketcall_cp(accept4, fd, addr, len, flg, 0, 0);
 	if (ret>=0 || (errno != ENOSYS && errno != EINVAL)) return ret;
+	if (flg & ~(SOCK_CLOEXEC|SOCK_NONBLOCK)) {
+		errno = EINVAL;
+		return -1;
+	}
 	ret = accept(fd, addr, len);
 	if (ret<0) return ret;
 	if (flg & SOCK_CLOEXEC)
diff --git a/src/network/dns_parse.c b/src/network/dns_parse.c
index e6ee19d9..09813112 100644
--- a/src/network/dns_parse.c
+++ b/src/network/dns_parse.c
@@ -1,7 +1,7 @@
 #include <string.h>
 #include "lookup.h"
 
-int __dns_parse(const unsigned char *r, int rlen, int (*callback)(void *, int, const void *, int, const void *), void *ctx)
+int __dns_parse(const unsigned char *r, int rlen, int (*callback)(void *, int, const void *, int, const void *, int), void *ctx)
 {
 	int qdcount, ancount;
 	const unsigned char *p;
@@ -12,21 +12,20 @@ int __dns_parse(const unsigned char *r, int rlen, int (*callback)(void *, int, c
 	p = r+12;
 	qdcount = r[4]*256 + r[5];
 	ancount = r[6]*256 + r[7];
-	if (qdcount+ancount > 64) return -1;
 	while (qdcount--) {
 		while (p-r < rlen && *p-1U < 127) p++;
-		if (*p>193 || (*p==193 && p[1]>254) || p>r+rlen-6)
+		if (p>r+rlen-6)
 			return -1;
 		p += 5 + !!*p;
 	}
 	while (ancount--) {
 		while (p-r < rlen && *p-1U < 127) p++;
-		if (*p>193 || (*p==193 && p[1]>254) || p>r+rlen-6)
+		if (p>r+rlen-12)
 			return -1;
 		p += 1 + !!*p;
 		len = p[8]*256 + p[9];
-		if (p+len > r+rlen) return -1;
-		if (callback(ctx, p[1], p+10, len, r) < 0) return -1;
+		if (len+10 > r+rlen-p) return -1;
+		if (callback(ctx, p[1], p+10, len, r, rlen) < 0) return -1;
 		p += 10 + len;
 	}
 	return 0;
diff --git a/src/network/gai_strerror.c b/src/network/gai_strerror.c
index 9596580e..56b71503 100644
--- a/src/network/gai_strerror.c
+++ b/src/network/gai_strerror.c
@@ -6,7 +6,7 @@ static const char msgs[] =
 	"Name does not resolve\0"
 	"Try again\0"
 	"Non-recoverable error\0"
-	"Unknown error\0"
+	"Name has no usable address\0"
 	"Unrecognized address family or invalid length\0"
 	"Unrecognized socket type\0"
 	"Unrecognized service\0"
diff --git a/src/network/getaddrinfo.c b/src/network/getaddrinfo.c
index efaab306..64ad259a 100644
--- a/src/network/getaddrinfo.c
+++ b/src/network/getaddrinfo.c
@@ -16,6 +16,7 @@ int getaddrinfo(const char *restrict host, const char *restrict serv, const stru
 	char canon[256], *outcanon;
 	int nservs, naddrs, nais, canon_len, i, j, k;
 	int family = AF_UNSPEC, flags = 0, proto = 0, socktype = 0;
+	int no_family = 0;
 	struct aibuf *out;
 
 	if (!host && !serv) return EAI_NONAME;
@@ -66,9 +67,11 @@ int getaddrinfo(const char *restrict host, const char *restrict serv, const stru
 				pthread_setcancelstate(
 					PTHREAD_CANCEL_DISABLE, &cs);
 				int r = connect(s, ta[i], tl[i]);
+				int saved_errno = errno;
 				pthread_setcancelstate(cs, 0);
 				close(s);
 				if (!r) continue;
+				errno = saved_errno;
 			}
 			switch (errno) {
 			case EADDRNOTAVAIL:
@@ -80,7 +83,7 @@ int getaddrinfo(const char *restrict host, const char *restrict serv, const stru
 			default:
 				return EAI_SYSTEM;
 			}
-			if (family == tf[i]) return EAI_NONAME;
+			if (family == tf[i]) no_family = 1;
 			family = tf[1-i];
 		}
 	}
@@ -91,6 +94,8 @@ int getaddrinfo(const char *restrict host, const char *restrict serv, const stru
 	naddrs = __lookup_name(addrs, canon, host, family, flags);
 	if (naddrs < 0) return naddrs;
 
+	if (no_family) return EAI_NODATA;
+
 	nais = nservs * naddrs;
 	canon_len = strlen(canon);
 	out = calloc(1, nais * sizeof(*out) + canon_len + 1);
diff --git a/src/network/gethostbyaddr.c b/src/network/gethostbyaddr.c
index 598e2241..c3cacaac 100644
--- a/src/network/gethostbyaddr.c
+++ b/src/network/gethostbyaddr.c
@@ -20,5 +20,5 @@ struct hostent *gethostbyaddr(const void *a, socklen_t l, int af)
 		err = gethostbyaddr_r(a, l, af, h,
 			(void *)(h+1), size-sizeof *h, &res, &h_errno);
 	} while (err == ERANGE);
-	return err ? 0 : h;
+	return res;
 }
diff --git a/src/network/gethostbyaddr_r.c b/src/network/gethostbyaddr_r.c
index 0f1e61aa..ceaf3935 100644
--- a/src/network/gethostbyaddr_r.c
+++ b/src/network/gethostbyaddr_r.c
@@ -54,10 +54,11 @@ int gethostbyaddr_r(const void *a, socklen_t l, int af,
 	case EAI_OVERFLOW:
 		return ERANGE;
 	default:
-	case EAI_MEMORY:
-	case EAI_SYSTEM:
 	case EAI_FAIL:
 		*err = NO_RECOVERY;
+		return EBADMSG;
+	case EAI_SYSTEM:
+		*err = NO_RECOVERY;
 		return errno;
 	case 0:
 		break;
diff --git a/src/network/gethostbyname2.c b/src/network/gethostbyname2.c
index dc9d6621..bd0da7f8 100644
--- a/src/network/gethostbyname2.c
+++ b/src/network/gethostbyname2.c
@@ -21,5 +21,5 @@ struct hostent *gethostbyname2(const char *name, int af)
 		err = gethostbyname2_r(name, af, h,
 			(void *)(h+1), size-sizeof *h, &res, &h_errno);
 	} while (err == ERANGE);
-	return err ? 0 : h;
+	return res;
 }
diff --git a/src/network/gethostbyname2_r.c b/src/network/gethostbyname2_r.c
index fc894877..a5eb67fe 100644
--- a/src/network/gethostbyname2_r.c
+++ b/src/network/gethostbyname2_r.c
@@ -22,7 +22,10 @@ int gethostbyname2_r(const char *name, int af,
 	if (cnt<0) switch (cnt) {
 	case EAI_NONAME:
 		*err = HOST_NOT_FOUND;
-		return ENOENT;
+		return 0;
+	case EAI_NODATA:
+		*err = NO_DATA;
+		return 0;
 	case EAI_AGAIN:
 		*err = TRY_AGAIN;
 		return EAGAIN;
@@ -30,7 +33,6 @@ int gethostbyname2_r(const char *name, int af,
 	case EAI_FAIL:
 		*err = NO_RECOVERY;
 		return EBADMSG;
-	case EAI_MEMORY:
 	case EAI_SYSTEM:
 		*err = NO_RECOVERY;
 		return errno;
diff --git a/src/network/getifaddrs.c b/src/network/getifaddrs.c
index fed75bd8..74df4d6c 100644
--- a/src/network/getifaddrs.c
+++ b/src/network/getifaddrs.c
@@ -39,8 +39,8 @@ struct ifaddrs_storage {
 };
 
 struct ifaddrs_ctx {
-	struct ifaddrs_storage *first;
-	struct ifaddrs_storage *last;
+	struct ifaddrs *first;
+	struct ifaddrs *last;
 	struct ifaddrs_storage *hash[IFADDRS_HASH_SIZE];
 };
 
@@ -195,9 +195,9 @@ static int netlink_msg_to_ifaddr(void *pctx, struct nlmsghdr *h)
 	}
 
 	if (ifs->ifa.ifa_name) {
-		if (!ctx->first) ctx->first = ifs;
-		if (ctx->last) ctx->last->ifa.ifa_next = &ifs->ifa;
-		ctx->last = ifs;
+		if (!ctx->first) ctx->first = &ifs->ifa;
+		if (ctx->last) ctx->last->ifa_next = &ifs->ifa;
+		ctx->last = &ifs->ifa;
 	} else {
 		free(ifs);
 	}
@@ -210,7 +210,7 @@ int getifaddrs(struct ifaddrs **ifap)
 	int r;
 	memset(ctx, 0, sizeof *ctx);
 	r = __rtnetlink_enumerate(AF_UNSPEC, AF_UNSPEC, netlink_msg_to_ifaddr, ctx);
-	if (r == 0) *ifap = &ctx->first->ifa;
-	else freeifaddrs(&ctx->first->ifa);
+	if (r == 0) *ifap = ctx->first;
+	else freeifaddrs(ctx->first);
 	return r;
 }
diff --git a/src/network/getnameinfo.c b/src/network/getnameinfo.c
index f77e73ad..133c15b3 100644
--- a/src/network/getnameinfo.c
+++ b/src/network/getnameinfo.c
@@ -58,6 +58,7 @@ static void reverse_hosts(char *buf, const unsigned char *a, unsigned scopeid, i
 		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
 
 		for (p=line; *p && !isspace(*p); p++);
+		if (!*p) continue;
 		*p++ = 0;
 		if (__lookup_ipliteral(&iplit, line, AF_UNSPEC)<=0)
 			continue;
@@ -108,10 +109,10 @@ static void reverse_services(char *buf, int port, int dgram)
 	__fclose_ca(f);
 }
 
-static int dns_parse_callback(void *c, int rr, const void *data, int len, const void *packet)
+static int dns_parse_callback(void *c, int rr, const void *data, int len, const void *packet, int plen)
 {
 	if (rr != RR_PTR) return 0;
-	if (__dn_expand(packet, (const unsigned char *)packet + 512,
+	if (__dn_expand(packet, (const unsigned char *)packet + plen,
 	    data, c, 256) <= 0)
 		*(char *)c = 0;
 	return 0;
@@ -158,10 +159,13 @@ int getnameinfo(const struct sockaddr *restrict sa, socklen_t sl,
 			unsigned char query[18+PTR_MAX], reply[512];
 			int qlen = __res_mkquery(0, ptr, 1, RR_PTR,
 				0, 0, 0, query, sizeof query);
+			query[3] = 0; /* don't need AD flag */
 			int rlen = __res_send(query, qlen, reply, sizeof reply);
 			buf[0] = 0;
-			if (rlen > 0)
+			if (rlen > 0) {
+				if (rlen > sizeof reply) rlen = sizeof reply;
 				__dns_parse(reply, rlen, dns_parse_callback, buf);
+			}
 		}
 		if (!*buf) {
 			if (flags & NI_NAMEREQD) return EAI_NONAME;
diff --git a/src/network/getservbyport_r.c b/src/network/getservbyport_r.c
index b7f21c6b..e4cc3079 100644
--- a/src/network/getservbyport_r.c
+++ b/src/network/getservbyport_r.c
@@ -26,7 +26,7 @@ int getservbyport_r(int port, const char *prots,
 	/* Align buffer */
 	i = (uintptr_t)buf & sizeof(char *)-1;
 	if (!i) i = sizeof(char *);
-	if (buflen < 3*sizeof(char *)-i)
+	if (buflen <= 3*sizeof(char *)-i)
 		return ERANGE;
 	buf += sizeof(char *)-i;
 	buflen -= sizeof(char *)-i;
@@ -46,6 +46,8 @@ int getservbyport_r(int port, const char *prots,
 	case EAI_MEMORY:
 	case EAI_SYSTEM:
 		return ENOMEM;
+	case EAI_OVERFLOW:
+		return ERANGE;
 	default:
 		return ENOENT;
 	case 0:
diff --git a/src/network/h_errno.c b/src/network/h_errno.c
index 4f700cea..638f7718 100644
--- a/src/network/h_errno.c
+++ b/src/network/h_errno.c
@@ -1,9 +1,11 @@
 #include <netdb.h>
+#include "pthread_impl.h"
 
 #undef h_errno
 int h_errno;
 
 int *__h_errno_location(void)
 {
-	return &h_errno;
+	if (!__pthread_self()->stack) return &h_errno;
+	return &__pthread_self()->h_errno_val;
 }
diff --git a/src/network/herror.c b/src/network/herror.c
index 65f25ff3..87f8cff4 100644
--- a/src/network/herror.c
+++ b/src/network/herror.c
@@ -4,5 +4,5 @@
 
 void herror(const char *msg)
 {
-	fprintf(stderr, "%s%s%s", msg?msg:"", msg?": ":"", hstrerror(h_errno));
+	fprintf(stderr, "%s%s%s\n", msg?msg:"", msg?": ":"", hstrerror(h_errno));
 }
diff --git a/src/network/inet_ntop.c b/src/network/inet_ntop.c
index 4bfef2c5..f442f47d 100644
--- a/src/network/inet_ntop.c
+++ b/src/network/inet_ntop.c
@@ -34,7 +34,12 @@ const char *inet_ntop(int af, const void *restrict a0, char *restrict s, socklen
 		for (i=best=0, max=2; buf[i]; i++) {
 			if (i && buf[i] != ':') continue;
 			j = strspn(buf+i, ":0");
-			if (j>max) best=i, max=j;
+			/* The leading sequence of zeros (best==0) is
+			 * disadvantaged compared to sequences elsewhere
+			 * as it doesn't have a leading colon. One extra
+			 * character is required for another sequence to
+			 * beat it fairly. */
+			if (j>max+(best==0)) best=i, max=j;
 		}
 		if (max>3) {
 			buf[best] = buf[best+1] = ':';
diff --git a/src/network/inet_pton.c b/src/network/inet_pton.c
index d36c3689..bcbdd9ef 100644
--- a/src/network/inet_pton.c
+++ b/src/network/inet_pton.c
@@ -54,6 +54,7 @@ int inet_pton(int af, const char *restrict s, void *restrict a0)
 			if (s[j]!='.' || (i<6 && brk<0)) return 0;
 			need_v4=1;
 			i++;
+			ip[i&7]=0;
 			break;
 		}
 		s += j+1;
diff --git a/src/network/lookup.h b/src/network/lookup.h
index ef662725..54b2f8b5 100644
--- a/src/network/lookup.h
+++ b/src/network/lookup.h
@@ -50,6 +50,6 @@ hidden int __lookup_ipliteral(struct address buf[static 1], const char *name, in
 hidden int __get_resolv_conf(struct resolvconf *, char *, size_t);
 hidden int __res_msend_rc(int, const unsigned char *const *, const int *, unsigned char *const *, int *, int, const struct resolvconf *);
 
-hidden int __dns_parse(const unsigned char *, int, int (*)(void *, int, const void *, int, const void *), void *);
+hidden int __dns_parse(const unsigned char *, int, int (*)(void *, int, const void *, int, const void *, int), void *);
 
 #endif
diff --git a/src/network/lookup_ipliteral.c b/src/network/lookup_ipliteral.c
index 2fddab73..1e766206 100644
--- a/src/network/lookup_ipliteral.c
+++ b/src/network/lookup_ipliteral.c
@@ -15,7 +15,7 @@ int __lookup_ipliteral(struct address buf[static 1], const char *name, int famil
 	struct in6_addr a6;
 	if (__inet_aton(name, &a4) > 0) {
 		if (family == AF_INET6) /* wrong family */
-			return EAI_NONAME;
+			return EAI_NODATA;
 		memcpy(&buf[0].addr, &a4, sizeof a4);
 		buf[0].family = AF_INET;
 		buf[0].scopeid = 0;
@@ -34,7 +34,7 @@ int __lookup_ipliteral(struct address buf[static 1], const char *name, int famil
 	if (inet_pton(AF_INET6, name, &a6) <= 0)
 		return 0;
 	if (family == AF_INET) /* wrong family */
-		return EAI_NONAME;
+		return EAI_NODATA;
 
 	memcpy(&buf[0].addr, &a6, sizeof a6);
 	buf[0].family = AF_INET6;
diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index c93263a9..35218185 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -50,7 +50,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 {
 	char line[512];
 	size_t l = strlen(name);
-	int cnt = 0, badfam = 0;
+	int cnt = 0, badfam = 0, have_canon = 0;
 	unsigned char _buf[1032];
 	FILE _f, *f = __fopen_rb_ca("/etc/hosts", &_f, _buf, sizeof _buf);
 	if (!f) switch (errno) {
@@ -79,15 +79,20 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		case 0:
 			continue;
 		default:
-			badfam = EAI_NONAME;
-			continue;
+			badfam = EAI_NODATA;
+			break;
 		}
 
+		if (have_canon) continue;
+
 		/* Extract first name as canonical name */
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		if (is_valid_hostname(p)) {
+			have_canon = 1;
+			memcpy(canon, p, z-p+1);
+		}
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
@@ -97,45 +102,50 @@ struct dpc_ctx {
 	struct address *addrs;
 	char *canon;
 	int cnt;
+	int rrtype;
 };
 
 #define RR_A 1
 #define RR_CNAME 5
 #define RR_AAAA 28
 
-static int dns_parse_callback(void *c, int rr, const void *data, int len, const void *packet)
+#define ABUF_SIZE 4800
+
+static int dns_parse_callback(void *c, int rr, const void *data, int len, const void *packet, int plen)
 {
 	char tmp[256];
+	int family;
 	struct dpc_ctx *ctx = c;
-	if (ctx->cnt >= MAXADDRS) return -1;
+	if (rr == RR_CNAME) {
+		if (__dn_expand(packet, (const unsigned char *)packet + plen,
+		    data, tmp, sizeof tmp) > 0 && is_valid_hostname(tmp))
+			strcpy(ctx->canon, tmp);
+		return 0;
+	}
+	if (ctx->cnt >= MAXADDRS) return 0;
+	if (rr != ctx->rrtype) return 0;
 	switch (rr) {
 	case RR_A:
 		if (len != 4) return -1;
-		ctx->addrs[ctx->cnt].family = AF_INET;
-		ctx->addrs[ctx->cnt].scopeid = 0;
-		memcpy(ctx->addrs[ctx->cnt++].addr, data, 4);
+		family = AF_INET;
 		break;
 	case RR_AAAA:
 		if (len != 16) return -1;
-		ctx->addrs[ctx->cnt].family = AF_INET6;
-		ctx->addrs[ctx->cnt].scopeid = 0;
-		memcpy(ctx->addrs[ctx->cnt++].addr, data, 16);
-		break;
-	case RR_CNAME:
-		if (__dn_expand(packet, (const unsigned char *)packet + 512,
-		    data, tmp, sizeof tmp) > 0 && is_valid_hostname(tmp))
-			strcpy(ctx->canon, tmp);
+		family = AF_INET6;
 		break;
 	}
+	ctx->addrs[ctx->cnt].family = family;
+	ctx->addrs[ctx->cnt].scopeid = 0;
+	memcpy(ctx->addrs[ctx->cnt++].addr, data, len);
 	return 0;
 }
 
 static int name_from_dns(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, const struct resolvconf *conf)
 {
-	unsigned char qbuf[2][280], abuf[2][512];
+	unsigned char qbuf[2][280], abuf[2][ABUF_SIZE];
 	const unsigned char *qp[2] = { qbuf[0], qbuf[1] };
 	unsigned char *ap[2] = { abuf[0], abuf[1] };
-	int qlens[2], alens[2];
+	int qlens[2], alens[2], qtypes[2];
 	int i, nq = 0;
 	struct dpc_ctx ctx = { .addrs = buf, .canon = canon };
 	static const struct { int af; int rr; } afrr[2] = {
@@ -148,7 +158,12 @@ static int name_from_dns(struct address buf[static MAXADDRS], char canon[static
 			qlens[nq] = __res_mkquery(0, name, 1, afrr[i].rr,
 				0, 0, 0, qbuf[nq], sizeof *qbuf);
 			if (qlens[nq] == -1)
-				return EAI_NONAME;
+				return 0;
+			qtypes[nq] = afrr[i].rr;
+			qbuf[nq][3] = 0; /* don't need AD flag */
+			/* Ensure query IDs are distinct. */
+			if (nq && qbuf[nq][0] == qbuf[0][0])
+				qbuf[nq][0]++;
 			nq++;
 		}
 	}
@@ -156,14 +171,20 @@ static int name_from_dns(struct address buf[static MAXADDRS], char canon[static
 	if (__res_msend_rc(nq, qp, qlens, ap, alens, sizeof *abuf, conf) < 0)
 		return EAI_SYSTEM;
 
-	for (i=0; i<nq; i++)
+	for (i=0; i<nq; i++) {
+		if (alens[i] < 4 || (abuf[i][3] & 15) == 2) return EAI_AGAIN;
+		if ((abuf[i][3] & 15) == 3) return 0;
+		if ((abuf[i][3] & 15) != 0) return EAI_FAIL;
+	}
+
+	for (i=nq-1; i>=0; i--) {
+		ctx.rrtype = qtypes[i];
+		if (alens[i] > sizeof(abuf[i])) alens[i] = sizeof abuf[i];
 		__dns_parse(abuf[i], alens[i], dns_parse_callback, &ctx);
+	}
 
 	if (ctx.cnt) return ctx.cnt;
-	if (alens[0] < 4 || (abuf[0][3] & 15) == 2) return EAI_AGAIN;
-	if ((abuf[0][3] & 15) == 0) return EAI_NONAME;
-	if ((abuf[0][3] & 15) == 3) return 0;
-	return EAI_FAIL;
+	return EAI_NODATA;
 }
 
 static int name_from_dns_search(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family)
diff --git a/src/network/netlink.h b/src/network/netlink.h
index 38acb178..873fabe2 100644
--- a/src/network/netlink.h
+++ b/src/network/netlink.h
@@ -86,7 +86,7 @@ struct ifaddrmsg {
 #define RTA_DATALEN(rta)	((rta)->rta_len-sizeof(struct rtattr))
 #define RTA_DATAEND(rta)	((char*)(rta)+(rta)->rta_len)
 #define RTA_NEXT(rta)		(struct rtattr*)((char*)(rta)+NETLINK_ALIGN((rta)->rta_len))
-#define RTA_OK(nlh,end)		((char*)(end)-(char*)(rta) >= sizeof(struct rtattr))
+#define RTA_OK(rta,end)		((char*)(end)-(char*)(rta) >= sizeof(struct rtattr))
 
 #define NLMSG_RTA(nlh,len)	((void*)((char*)(nlh)+sizeof(struct nlmsghdr)+NETLINK_ALIGN(len)))
 #define NLMSG_RTAOK(rta,nlh)	RTA_OK(rta,NLMSG_DATAEND(nlh))
diff --git a/src/network/res_mkquery.c b/src/network/res_mkquery.c
index 6fa04a5c..614bf786 100644
--- a/src/network/res_mkquery.c
+++ b/src/network/res_mkquery.c
@@ -13,6 +13,7 @@ int __res_mkquery(int op, const char *dname, int class, int type,
 	int n;
 
 	if (l && dname[l-1]=='.') l--;
+	if (l && dname[l-1]=='.') return -1;
 	n = 17+l+!!l;
 	if (l>253 || buflen<n || op>15u || class>255u || type>255u)
 		return -1;
@@ -20,6 +21,7 @@ int __res_mkquery(int op, const char *dname, int class, int type,
 	/* Construct query template - ID will be filled later */
 	memset(q, 0, n);
 	q[2] = op*8 + 1;
+	q[3] = 32; /* AD */
 	q[5] = 1;
 	memcpy((char *)q+13, dname, l);
 	for (i=13; q[i]; i=j+1) {
diff --git a/src/network/res_msend.c b/src/network/res_msend.c
index 3e018009..fcb52513 100644
--- a/src/network/res_msend.c
+++ b/src/network/res_msend.c
@@ -1,5 +1,6 @@
 #include <sys/socket.h>
 #include <netinet/in.h>
+#include <netinet/tcp.h>
 #include <netdb.h>
 #include <arpa/inet.h>
 #include <stdint.h>
@@ -16,17 +17,65 @@
 
 static void cleanup(void *p)
 {
-	__syscall(SYS_close, (intptr_t)p);
+	struct pollfd *pfd = p;
+	for (int i=0; pfd[i].fd >= -1; i++)
+		if (pfd[i].fd >= 0) __syscall(SYS_close, pfd[i].fd);
 }
 
 static unsigned long mtime()
 {
 	struct timespec ts;
-	clock_gettime(CLOCK_REALTIME, &ts);
+	if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0 && errno == ENOSYS)
+		clock_gettime(CLOCK_REALTIME, &ts);
 	return (unsigned long)ts.tv_sec * 1000
 		+ ts.tv_nsec / 1000000;
 }
 
+static int start_tcp(struct pollfd *pfd, int family, const void *sa, socklen_t sl, const unsigned char *q, int ql)
+{
+	struct msghdr mh = {
+		.msg_name = (void *)sa,
+		.msg_namelen = sl,
+		.msg_iovlen = 2,
+		.msg_iov = (struct iovec [2]){
+			{ .iov_base = (uint8_t[]){ ql>>8, ql }, .iov_len = 2 },
+			{ .iov_base = (void *)q, .iov_len = ql } }
+	};
+	int r;
+	int fd = socket(family, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+	pfd->fd = fd;
+	pfd->events = POLLOUT;
+	if (!setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
+	    &(int){1}, sizeof(int))) {
+		r = sendmsg(fd, &mh, MSG_FASTOPEN|MSG_NOSIGNAL);
+		if (r == ql+2) pfd->events = POLLIN;
+		if (r >= 0) return r;
+		if (errno == EINPROGRESS) return 0;
+	}
+	r = connect(fd, sa, sl);
+	if (!r || errno == EINPROGRESS) return 0;
+	close(fd);
+	pfd->fd = -1;
+	return -1;
+}
+
+static void step_mh(struct msghdr *mh, size_t n)
+{
+	/* Adjust iovec in msghdr to skip first n bytes. */
+	while (mh->msg_iovlen && n >= mh->msg_iov->iov_len) {
+		n -= mh->msg_iov->iov_len;
+		mh->msg_iov++;
+		mh->msg_iovlen--;
+	}
+	if (!mh->msg_iovlen) return;
+	mh->msg_iov->iov_base = (char *)mh->msg_iov->iov_base + n;
+	mh->msg_iov->iov_len -= n;
+}
+
+/* Internal contract for __res_msend[_rc]: asize must be >=512, nqueries
+ * must be sufficiently small to be safe as VLA size. In practice it's
+ * either 1 or 2, anyway. */
+
 int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 	const int *qlens, unsigned char *const *answers, int *alens, int asize,
 	const struct resolvconf *conf)
@@ -34,8 +83,8 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 	int fd;
 	int timeout, attempts, retry_interval, servfail_retry;
 	union {
-		struct sockaddr_in sin;
 		struct sockaddr_in6 sin6;
+		struct sockaddr_in sin;
 	} sa = {0}, ns[MAXNS] = {{0}};
 	socklen_t sl = sizeof sa.sin;
 	int nns = 0;
@@ -44,7 +93,10 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 	int next;
 	int i, j;
 	int cs;
-	struct pollfd pfd;
+	struct pollfd pfd[nqueries+2];
+	int qpos[nqueries], apos[nqueries];
+	unsigned char alen_buf[nqueries][2];
+	int r;
 	unsigned long t0, t1, t2;
 
 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
@@ -68,29 +120,22 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 	}
 
 	/* Get local address and open/bind a socket */
-	sa.sin.sin_family = family;
 	fd = socket(family, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
 
 	/* Handle case where system lacks IPv6 support */
 	if (fd < 0 && family == AF_INET6 && errno == EAFNOSUPPORT) {
+		for (i=0; i<nns && conf->ns[nns].family == AF_INET6; i++);
+		if (i==nns) {
+			pthread_setcancelstate(cs, 0);
+			return -1;
+		}
 		fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
 		family = AF_INET;
+		sl = sizeof sa.sin;
 	}
-	if (fd < 0 || bind(fd, (void *)&sa, sl) < 0) {
-		if (fd >= 0) close(fd);
-		pthread_setcancelstate(cs, 0);
-		return -1;
-	}
-
-	/* Past this point, there are no errors. Each individual query will
-	 * yield either no reply (indicated by zero length) or an answer
-	 * packet which is up to the caller to interpret. */
-
-	pthread_cleanup_push(cleanup, (void *)(intptr_t)fd);
-	pthread_setcancelstate(cs, 0);
 
 	/* Convert any IPv4 addresses in a mixed environment to v4-mapped */
-	if (family == AF_INET6) {
+	if (fd >= 0 && family == AF_INET6) {
 		setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &(int){0}, sizeof 0);
 		for (i=0; i<nns; i++) {
 			if (ns[i].sin.sin_family != AF_INET) continue;
@@ -104,16 +149,38 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 		}
 	}
 
+	sa.sin.sin_family = family;
+	if (fd < 0 || bind(fd, (void *)&sa, sl) < 0) {
+		if (fd >= 0) close(fd);
+		pthread_setcancelstate(cs, 0);
+		return -1;
+	}
+
+	/* Past this point, there are no errors. Each individual query will
+	 * yield either no reply (indicated by zero length) or an answer
+	 * packet which is up to the caller to interpret. */
+
+	for (i=0; i<nqueries; i++) pfd[i].fd = -1;
+	pfd[nqueries].fd = fd;
+	pfd[nqueries].events = POLLIN;
+	pfd[nqueries+1].fd = -2;
+
+	pthread_cleanup_push(cleanup, pfd);
+	pthread_setcancelstate(cs, 0);
+
 	memset(alens, 0, sizeof *alens * nqueries);
 
-	pfd.fd = fd;
-	pfd.events = POLLIN;
 	retry_interval = timeout / attempts;
 	next = 0;
 	t0 = t2 = mtime();
 	t1 = t2 - retry_interval;
 
 	for (; t2-t0 < timeout; t2=mtime()) {
+		/* This is the loop exit condition: that all queries
+		 * have an accepted answer. */
+		for (i=0; i<nqueries && alens[i]>0; i++);
+		if (i==nqueries) break;
+
 		if (t2-t1 >= retry_interval) {
 			/* Query all configured namservers in parallel */
 			for (i=0; i<nqueries; i++)
@@ -127,10 +194,20 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 		}
 
 		/* Wait for a response, or until time to retry */
-		if (poll(&pfd, 1, t1+retry_interval-t2) <= 0) continue;
+		if (poll(pfd, nqueries+1, t1+retry_interval-t2) <= 0) continue;
 
-		while ((rlen = recvfrom(fd, answers[next], asize, 0,
-		  (void *)&sa, (socklen_t[1]){sl})) >= 0) {
+		while (next < nqueries) {
+			struct msghdr mh = {
+				.msg_name = (void *)&sa,
+				.msg_namelen = sl,
+				.msg_iovlen = 1,
+				.msg_iov = (struct iovec []){
+					{ .iov_base = (void *)answers[next],
+					  .iov_len = asize }
+				}
+			};
+			rlen = recvmsg(fd, &mh, 0);
+			if (rlen < 0) break;
 
 			/* Ignore non-identifiable packets */
 			if (rlen < 4) continue;
@@ -170,12 +247,72 @@ int __res_msend_rc(int nqueries, const unsigned char *const *queries,
 			else
 				memcpy(answers[i], answers[next], rlen);
 
-			if (next == nqueries) goto out;
+			/* Ignore further UDP if all slots full or TCP-mode */
+			if (next == nqueries) pfd[nqueries].events = 0;
+
+			/* If answer is truncated (TC bit), fallback to TCP */
+			if ((answers[i][2] & 2) || (mh.msg_flags & MSG_TRUNC)) {
+				alens[i] = -1;
+				pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
+				r = start_tcp(pfd+i, family, ns+j, sl, queries[i], qlens[i]);
+				pthread_setcancelstate(cs, 0);
+				if (r >= 0) {
+					qpos[i] = r;
+					apos[i] = 0;
+				}
+				continue;
+			}
+		}
+
+		for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLOUT) {
+			struct msghdr mh = {
+				.msg_iovlen = 2,
+				.msg_iov = (struct iovec [2]){
+					{ .iov_base = (uint8_t[]){ qlens[i]>>8, qlens[i] }, .iov_len = 2 },
+					{ .iov_base = (void *)queries[i], .iov_len = qlens[i] } }
+			};
+			step_mh(&mh, qpos[i]);
+			r = sendmsg(pfd[i].fd, &mh, MSG_NOSIGNAL);
+			if (r < 0) goto out;
+			qpos[i] += r;
+			if (qpos[i] == qlens[i]+2)
+				pfd[i].events = POLLIN;
+		}
+
+		for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLIN) {
+			struct msghdr mh = {
+				.msg_iovlen = 2,
+				.msg_iov = (struct iovec [2]){
+					{ .iov_base = alen_buf[i], .iov_len = 2 },
+					{ .iov_base = answers[i], .iov_len = asize } }
+			};
+			step_mh(&mh, apos[i]);
+			r = recvmsg(pfd[i].fd, &mh, 0);
+			if (r <= 0) goto out;
+			apos[i] += r;
+			if (apos[i] < 2) continue;
+			int alen = alen_buf[i][0]*256 + alen_buf[i][1];
+			if (alen < 13) goto out;
+			if (apos[i] < alen+2 && apos[i] < asize+2)
+				continue;
+			int rcode = answers[i][3] & 15;
+			if (rcode != 0 && rcode != 3)
+				goto out;
+
+			/* Storing the length here commits the accepted answer.
+			 * Immediately close TCP socket so as not to consume
+			 * resources we no longer need. */
+			alens[i] = alen;
+			__syscall(SYS_close, pfd[i].fd);
+			pfd[i].fd = -1;
 		}
 	}
 out:
 	pthread_cleanup_pop(1);
 
+	/* Disregard any incomplete TCP results */
+	for (i=0; i<nqueries; i++) if (alens[i]<0) alens[i] = 0;
+
 	return 0;
 }
 
diff --git a/src/network/res_query.c b/src/network/res_query.c
index 2f4da2e2..506dc231 100644
--- a/src/network/res_query.c
+++ b/src/network/res_query.c
@@ -1,3 +1,4 @@
+#define _BSD_SOURCE
 #include <resolv.h>
 #include <netdb.h>
 
@@ -6,7 +7,20 @@ int res_query(const char *name, int class, int type, unsigned char *dest, int le
 	unsigned char q[280];
 	int ql = __res_mkquery(0, name, class, type, 0, 0, 0, q, sizeof q);
 	if (ql < 0) return ql;
-	return __res_send(q, ql, dest, len);
+	int r = __res_send(q, ql, dest, len);
+	if (r<12) {
+		h_errno = TRY_AGAIN;
+		return -1;
+	}
+	if ((dest[3] & 15) == 3) {
+		h_errno = HOST_NOT_FOUND;
+		return -1;
+	}
+	if ((dest[3] & 15) == 0 && !dest[6] && !dest[7]) {
+		h_errno = NO_DATA;
+		return -1;
+	}
+	return r;
 }
 
 weak_alias(res_query, res_search);
diff --git a/src/network/res_send.c b/src/network/res_send.c
index b9cea0bf..9593164d 100644
--- a/src/network/res_send.c
+++ b/src/network/res_send.c
@@ -1,9 +1,17 @@
 #include <resolv.h>
+#include <string.h>
 
 int __res_send(const unsigned char *msg, int msglen, unsigned char *answer, int anslen)
 {
-	int r = __res_msend(1, &msg, &msglen, &answer, &anslen, anslen);
-	return r<0 ? r : anslen;
+	int r;
+	if (anslen < 512) {
+		unsigned char buf[512];
+		r = __res_send(msg, msglen, buf, sizeof buf);
+		if (r >= 0) memcpy(answer, buf, r < anslen ? r : anslen);
+		return r;
+	}
+	r = __res_msend(1, &msg, &msglen, &answer, &anslen, anslen);
+	return r<0 || !anslen ? -1 : anslen;
 }
 
 weak_alias(__res_send, res_send);
diff --git a/src/network/sendmsg.c b/src/network/sendmsg.c
index 80cc5f41..acdfdf29 100644
--- a/src/network/sendmsg.c
+++ b/src/network/sendmsg.c
@@ -8,13 +8,16 @@ ssize_t sendmsg(int fd, const struct msghdr *msg, int flags)
 {
 #if LONG_MAX > INT_MAX
 	struct msghdr h;
-	struct cmsghdr chbuf[1024/sizeof(struct cmsghdr)+1], *c;
+	/* Kernels before 2.6.38 set SCM_MAX_FD to 255, allocate enough
+	 * space to support an SCM_RIGHTS ancillary message with 255 fds.
+	 * Kernels since 2.6.38 set SCM_MAX_FD to 253. */
+	struct cmsghdr chbuf[CMSG_SPACE(255*sizeof(int))/sizeof(struct cmsghdr)+1], *c;
 	if (msg) {
 		h = *msg;
 		h.__pad1 = h.__pad2 = 0;
 		msg = &h;
 		if (h.msg_controllen) {
-			if (h.msg_controllen > 1024) {
+			if (h.msg_controllen > sizeof chbuf) {
 				errno = ENOMEM;
 				return -1;
 			}
diff --git a/src/network/socket.c b/src/network/socket.c
index a2e92d90..afa1a7f3 100644
--- a/src/network/socket.c
+++ b/src/network/socket.c
@@ -5,17 +5,17 @@
 
 int socket(int domain, int type, int protocol)
 {
-	int s = socketcall(socket, domain, type, protocol, 0, 0, 0);
-	if (s<0 && (errno==EINVAL || errno==EPROTONOSUPPORT)
+	int s = __socketcall(socket, domain, type, protocol, 0, 0, 0);
+	if ((s==-EINVAL || s==-EPROTONOSUPPORT)
 	    && (type&(SOCK_CLOEXEC|SOCK_NONBLOCK))) {
-		s = socketcall(socket, domain,
+		s = __socketcall(socket, domain,
 			type & ~(SOCK_CLOEXEC|SOCK_NONBLOCK),
 			protocol, 0, 0, 0);
-		if (s < 0) return s;
+		if (s < 0) return __syscall_ret(s);
 		if (type & SOCK_CLOEXEC)
 			__syscall(SYS_fcntl, s, F_SETFD, FD_CLOEXEC);
 		if (type & SOCK_NONBLOCK)
 			__syscall(SYS_fcntl, s, F_SETFL, O_NONBLOCK);
 	}
-	return s;
+	return __syscall_ret(s);
 }
diff --git a/src/passwd/getgrouplist.c b/src/passwd/getgrouplist.c
index 43e51824..301824ce 100644
--- a/src/passwd/getgrouplist.c
+++ b/src/passwd/getgrouplist.c
@@ -31,7 +31,8 @@ int getgrouplist(const char *user, gid_t gid, gid_t *groups, int *ngroups)
 	if (resp[INITGRFOUND]) {
 		nscdbuf = calloc(resp[INITGRNGRPS], sizeof(uint32_t));
 		if (!nscdbuf) goto cleanup;
-		if (!fread(nscdbuf, sizeof(*nscdbuf)*resp[INITGRNGRPS], 1, f)) {
+		size_t nbytes = sizeof(*nscdbuf)*resp[INITGRNGRPS];
+		if (nbytes && !fread(nscdbuf, nbytes, 1, f)) {
 			if (!ferror(f)) errno = EIO;
 			goto cleanup;
 		}
diff --git a/src/passwd/nscd_query.c b/src/passwd/nscd_query.c
index d38e371b..dc3406b8 100644
--- a/src/passwd/nscd_query.c
+++ b/src/passwd/nscd_query.c
@@ -40,7 +40,15 @@ retry:
 	buf[0] = NSCDVERSION;
 
 	fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
-	if (fd < 0) return NULL;
+	if (fd < 0) {
+		if (errno == EAFNOSUPPORT) {
+			f = fopen("/dev/null", "re");
+			if (f)
+				errno = errno_save;
+			return f;
+		}
+		return 0;
+	}
 
 	if(!(f = fdopen(fd, "r"))) {
 		close(fd);
diff --git a/src/prng/random.c b/src/prng/random.c
index 633a17f6..d3780fa7 100644
--- a/src/prng/random.c
+++ b/src/prng/random.c
@@ -1,6 +1,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include "lock.h"
+#include "fork_impl.h"
 
 /*
 this code uses the same lagged fibonacci generator as the
@@ -23,6 +24,7 @@ static int i = 3;
 static int j = 0;
 static uint32_t *x = init+1;
 static volatile int lock[1];
+volatile int *const __random_lockptr = lock;
 
 static uint32_t lcg31(uint32_t x) {
 	return (1103515245*x + 12345) & 0x7fffffff;
diff --git a/src/process/_Fork.c b/src/process/_Fork.c
new file mode 100644
index 00000000..9c07792d
--- /dev/null
+++ b/src/process/_Fork.c
@@ -0,0 +1,43 @@
+#include <unistd.h>
+#include <signal.h>
+#include "syscall.h"
+#include "libc.h"
+#include "lock.h"
+#include "pthread_impl.h"
+#include "aio_impl.h"
+#include "fork_impl.h"
+
+static void dummy(int x) { }
+weak_alias(dummy, __aio_atfork);
+
+void __post_Fork(int ret)
+{
+	if (!ret) {
+		pthread_t self = __pthread_self();
+		self->tid = __syscall(SYS_set_tid_address, &__thread_list_lock);
+		self->robust_list.off = 0;
+		self->robust_list.pending = 0;
+		self->next = self->prev = self;
+		__thread_list_lock = 0;
+		libc.threads_minus_1 = 0;
+		if (libc.need_locks) libc.need_locks = -1;
+	}
+	UNLOCK(__abort_lock);
+	if (!ret) __aio_atfork(1);
+}
+
+pid_t _Fork(void)
+{
+	pid_t ret;
+	sigset_t set;
+	__block_all_sigs(&set);
+	LOCK(__abort_lock);
+#ifdef SYS_fork
+	ret = __syscall(SYS_fork);
+#else
+	ret = __syscall(SYS_clone, SIGCHLD, 0);
+#endif
+	__post_Fork(ret);
+	__restore_sigs(&set);
+	return __syscall_ret(ret);
+}
diff --git a/src/process/aarch64/vfork.s b/src/process/aarch64/vfork.s
new file mode 100644
index 00000000..429bec8c
--- /dev/null
+++ b/src/process/aarch64/vfork.s
@@ -0,0 +1,9 @@
+.global vfork
+.type vfork,%function
+vfork:
+	mov x8, 220    // SYS_clone
+	mov x0, 0x4111 // SIGCHLD | CLONE_VM | CLONE_VFORK
+	mov x1, 0
+	svc 0
+	.hidden __syscall_ret
+	b __syscall_ret
diff --git a/src/process/fdop.h b/src/process/fdop.h
index 5adf1443..7cf733b2 100644
--- a/src/process/fdop.h
+++ b/src/process/fdop.h
@@ -10,3 +10,8 @@ struct fdop {
 	mode_t mode;
 	char path[];
 };
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free __libc_free
diff --git a/src/process/fork.c b/src/process/fork.c
index fb42478a..56f19313 100644
--- a/src/process/fork.c
+++ b/src/process/fork.c
@@ -1,37 +1,90 @@
 #include <unistd.h>
-#include <string.h>
-#include <signal.h>
-#include "syscall.h"
+#include <errno.h>
 #include "libc.h"
+#include "lock.h"
 #include "pthread_impl.h"
+#include "fork_impl.h"
 
-static void dummy(int x)
-{
-}
+static volatile int *const dummy_lockptr = 0;
+
+weak_alias(dummy_lockptr, __at_quick_exit_lockptr);
+weak_alias(dummy_lockptr, __atexit_lockptr);
+weak_alias(dummy_lockptr, __gettext_lockptr);
+weak_alias(dummy_lockptr, __locale_lockptr);
+weak_alias(dummy_lockptr, __random_lockptr);
+weak_alias(dummy_lockptr, __sem_open_lockptr);
+weak_alias(dummy_lockptr, __stdio_ofl_lockptr);
+weak_alias(dummy_lockptr, __syslog_lockptr);
+weak_alias(dummy_lockptr, __timezone_lockptr);
+weak_alias(dummy_lockptr, __bump_lockptr);
+
+weak_alias(dummy_lockptr, __vmlock_lockptr);
 
+static volatile int *const *const atfork_locks[] = {
+	&__at_quick_exit_lockptr,
+	&__atexit_lockptr,
+	&__gettext_lockptr,
+	&__locale_lockptr,
+	&__random_lockptr,
+	&__sem_open_lockptr,
+	&__stdio_ofl_lockptr,
+	&__syslog_lockptr,
+	&__timezone_lockptr,
+	&__bump_lockptr,
+};
+
+static void dummy(int x) { }
 weak_alias(dummy, __fork_handler);
+weak_alias(dummy, __malloc_atfork);
+weak_alias(dummy, __aio_atfork);
+weak_alias(dummy, __pthread_key_atfork);
+weak_alias(dummy, __ldso_atfork);
+
+static void dummy_0(void) { }
+weak_alias(dummy_0, __tl_lock);
+weak_alias(dummy_0, __tl_unlock);
 
 pid_t fork(void)
 {
-	pid_t ret;
 	sigset_t set;
 	__fork_handler(-1);
-	__block_all_sigs(&set);
-#ifdef SYS_fork
-	ret = __syscall(SYS_fork);
-#else
-	ret = __syscall(SYS_clone, SIGCHLD, 0);
-#endif
-	if (!ret) {
-		pthread_t self = __pthread_self();
-		self->tid = __syscall(SYS_gettid);
-		self->robust_list.off = 0;
-		self->robust_list.pending = 0;
-		self->next = self->prev = self;
-		__thread_list_lock = 0;
-		libc.threads_minus_1 = 0;
+	__block_app_sigs(&set);
+	int need_locks = libc.need_locks > 0;
+	if (need_locks) {
+		__ldso_atfork(-1);
+		__pthread_key_atfork(-1);
+		__aio_atfork(-1);
+		__inhibit_ptc();
+		for (int i=0; i<sizeof atfork_locks/sizeof *atfork_locks; i++)
+			if (*atfork_locks[i]) LOCK(*atfork_locks[i]);
+		__malloc_atfork(-1);
+		__tl_lock();
+	}
+	pthread_t self=__pthread_self(), next=self->next;
+	pid_t ret = _Fork();
+	int errno_save = errno;
+	if (need_locks) {
+		if (!ret) {
+			for (pthread_t td=next; td!=self; td=td->next)
+				td->tid = -1;
+			if (__vmlock_lockptr) {
+				__vmlock_lockptr[0] = 0;
+				__vmlock_lockptr[1] = 0;
+			}
+		}
+		__tl_unlock();
+		__malloc_atfork(!ret);
+		for (int i=0; i<sizeof atfork_locks/sizeof *atfork_locks; i++)
+			if (*atfork_locks[i])
+				if (ret) UNLOCK(*atfork_locks[i]);
+				else **atfork_locks[i] = 0;
+		__release_ptc();
+		if (ret) __aio_atfork(0);
+		__pthread_key_atfork(!ret);
+		__ldso_atfork(!ret);
 	}
 	__restore_sigs(&set);
 	__fork_handler(!ret);
-	return __syscall_ret(ret);
+	if (ret<0) errno = errno_save;
+	return ret;
 }
diff --git a/src/process/posix_spawn.c b/src/process/posix_spawn.c
index 29652197..8294598b 100644
--- a/src/process/posix_spawn.c
+++ b/src/process/posix_spawn.c
@@ -4,8 +4,10 @@
 #include <unistd.h>
 #include <signal.h>
 #include <fcntl.h>
+#include <errno.h>
 #include <sys/wait.h>
 #include "syscall.h"
+#include "lock.h"
 #include "pthread_impl.h"
 #include "fdop.h"
 
@@ -155,7 +157,11 @@ static int child(void *args_vp)
 fail:
 	/* Since sizeof errno < PIPE_BUF, the write is atomic. */
 	ret = -ret;
-	if (ret) while (__syscall(SYS_write, p, &ret, sizeof ret) < 0);
+	if (ret) {
+		int r;
+		do r = __syscall(SYS_write, p, &ret, sizeof ret);
+		while (r<0 && r!=-EPIPE);
+	}
 	_exit(127);
 }
 
@@ -170,9 +176,6 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 	int ec=0, cs;
 	struct args args;
 
-	if (pipe2(args.p, O_CLOEXEC))
-		return errno;
-
 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
 
 	args.path = path;
@@ -182,9 +185,20 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 	args.envp = envp;
 	pthread_sigmask(SIG_BLOCK, SIGALL_SET, &args.oldmask);
 
+	/* The lock guards both against seeing a SIGABRT disposition change
+	 * by abort and against leaking the pipe fd to fork-without-exec. */
+	LOCK(__abort_lock);
+
+	if (pipe2(args.p, O_CLOEXEC)) {
+		UNLOCK(__abort_lock);
+		ec = errno;
+		goto fail;
+	}
+
 	pid = __clone(child, stack+sizeof stack,
 		CLONE_VM|CLONE_VFORK|SIGCHLD, &args);
 	close(args.p[1]);
+	UNLOCK(__abort_lock);
 
 	if (pid > 0) {
 		if (read(args.p[0], &ec, sizeof ec) != sizeof ec) ec = 0;
@@ -197,6 +211,7 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 
 	if (!ec && res) *res = pid;
 
+fail:
 	pthread_sigmask(SIG_SETMASK, &args.oldmask, 0);
 	pthread_setcancelstate(cs, 0);
 
diff --git a/src/process/posix_spawn_file_actions_addclose.c b/src/process/posix_spawn_file_actions_addclose.c
index cdda5979..0c2ef8fa 100644
--- a/src/process/posix_spawn_file_actions_addclose.c
+++ b/src/process/posix_spawn_file_actions_addclose.c
@@ -5,6 +5,7 @@
 
 int posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *fa, int fd)
 {
+	if (fd < 0) return EBADF;
 	struct fdop *op = malloc(sizeof *op);
 	if (!op) return ENOMEM;
 	op->cmd = FDOP_CLOSE;
diff --git a/src/process/posix_spawn_file_actions_adddup2.c b/src/process/posix_spawn_file_actions_adddup2.c
index 0367498f..addca4d4 100644
--- a/src/process/posix_spawn_file_actions_adddup2.c
+++ b/src/process/posix_spawn_file_actions_adddup2.c
@@ -5,6 +5,7 @@
 
 int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *fa, int srcfd, int fd)
 {
+	if (srcfd < 0 || fd < 0) return EBADF;
 	struct fdop *op = malloc(sizeof *op);
 	if (!op) return ENOMEM;
 	op->cmd = FDOP_DUP2;
diff --git a/src/process/posix_spawn_file_actions_addfchdir.c b/src/process/posix_spawn_file_actions_addfchdir.c
index 436c683d..e89ede8c 100644
--- a/src/process/posix_spawn_file_actions_addfchdir.c
+++ b/src/process/posix_spawn_file_actions_addfchdir.c
@@ -6,6 +6,7 @@
 
 int posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *fa, int fd)
 {
+	if (fd < 0) return EBADF;
 	struct fdop *op = malloc(sizeof *op);
 	if (!op) return ENOMEM;
 	op->cmd = FDOP_FCHDIR;
diff --git a/src/process/posix_spawn_file_actions_addopen.c b/src/process/posix_spawn_file_actions_addopen.c
index 368922c7..82bbcec9 100644
--- a/src/process/posix_spawn_file_actions_addopen.c
+++ b/src/process/posix_spawn_file_actions_addopen.c
@@ -6,6 +6,7 @@
 
 int posix_spawn_file_actions_addopen(posix_spawn_file_actions_t *restrict fa, int fd, const char *restrict path, int flags, mode_t mode)
 {
+	if (fd < 0) return EBADF;
 	struct fdop *op = malloc(sizeof *op + strlen(path) + 1);
 	if (!op) return ENOMEM;
 	op->cmd = FDOP_OPEN;
diff --git a/src/process/riscv64/vfork.s b/src/process/riscv64/vfork.s
new file mode 100644
index 00000000..c93dca23
--- /dev/null
+++ b/src/process/riscv64/vfork.s
@@ -0,0 +1,12 @@
+.global vfork
+.type vfork,@function
+vfork:
+	/* riscv does not have SYS_vfork, so we must use clone instead */
+	/* note: riscv's clone = clone(flags, sp, ptidptr, tls, ctidptr) */
+	li a7, 220
+	li a0, 0x100 | 0x4000 | 17 /* flags = CLONE_VM | CLONE_VFORK | SIGCHLD */
+	mv a1, sp
+	/* the other arguments are ignoreable */
+	ecall
+	.hidden __syscall_ret
+	j __syscall_ret
diff --git a/src/process/waitpid.c b/src/process/waitpid.c
index 1b65bf05..80231862 100644
--- a/src/process/waitpid.c
+++ b/src/process/waitpid.c
@@ -3,5 +3,5 @@
 
 pid_t waitpid(pid_t pid, int *status, int options)
 {
-	return syscall_cp(SYS_wait4, pid, status, options, 0);
+	return sys_wait4_cp(pid, status, options, 0);
 }
diff --git a/src/regex/glob.c b/src/regex/glob.c
index 9de080ed..87bae084 100644
--- a/src/regex/glob.c
+++ b/src/regex/glob.c
@@ -265,7 +265,7 @@ int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, i
 			if (append(&tail, pat, strlen(pat), 0))
 				return GLOB_NOSPACE;
 			cnt++;
-		} else
+		} else if (!error)
 			return GLOB_NOMATCH;
 	}
 
@@ -306,6 +306,3 @@ void globfree(glob_t *g)
 	g->gl_pathc = 0;
 	g->gl_pathv = NULL;
 }
-
-weak_alias(glob, glob64);
-weak_alias(globfree, globfree64);
diff --git a/src/search/hsearch.c b/src/search/hsearch.c
index b3ac8796..2634a67f 100644
--- a/src/search/hsearch.c
+++ b/src/search/hsearch.c
@@ -41,9 +41,9 @@ static int resize(size_t nel, struct hsearch_data *htab)
 {
 	size_t newsize;
 	size_t i, j;
+	size_t oldsize = htab->__tab->mask + 1;
 	ENTRY *e, *newe;
 	ENTRY *oldtab = htab->__tab->entries;
-	ENTRY *oldend = htab->__tab->entries + htab->__tab->mask + 1;
 
 	if (nel > MAXSIZE)
 		nel = MAXSIZE;
@@ -56,7 +56,7 @@ static int resize(size_t nel, struct hsearch_data *htab)
 	htab->__tab->mask = newsize - 1;
 	if (!oldtab)
 		return 1;
-	for (e = oldtab; e < oldend; e++)
+	for (e = oldtab; e < oldtab + oldsize; e++)
 		if (e->key) {
 			for (i=keyhash(e->key),j=1; ; i+=j++) {
 				newe = htab->__tab->entries + (i & htab->__tab->mask);
diff --git a/src/select/poll.c b/src/select/poll.c
index c84c8a99..7883dfab 100644
--- a/src/select/poll.c
+++ b/src/select/poll.c
@@ -8,8 +8,13 @@ int poll(struct pollfd *fds, nfds_t n, int timeout)
 #ifdef SYS_poll
 	return syscall_cp(SYS_poll, fds, n, timeout);
 #else
+#if SYS_ppoll_time64 == SYS_ppoll
+	typedef long long ppoll_ts_t[2];
+#else
+	typedef long ppoll_ts_t[2];
+#endif
 	return syscall_cp(SYS_ppoll, fds, n, timeout>=0 ?
-		&((struct timespec){ .tv_sec = timeout/1000,
-		.tv_nsec = timeout%1000*1000000 }) : 0, 0, _NSIG/8);
+		((ppoll_ts_t){ timeout/1000, timeout%1000*1000000 }) : 0,
+		0, _NSIG/8);
 #endif
 }
diff --git a/src/linux/ppoll.c b/src/select/ppoll.c
index e614600a..9a0bf929 100644
--- a/src/linux/ppoll.c
+++ b/src/select/ppoll.c
@@ -1,4 +1,4 @@
-#define _GNU_SOURCE
+#define _BSD_SOURCE
 #include <poll.h>
 #include <signal.h>
 #include <errno.h>
diff --git a/src/select/select.c b/src/select/select.c
index 8a786884..f1d72863 100644
--- a/src/select/select.c
+++ b/src/select/select.c
@@ -33,6 +33,7 @@ int select(int n, fd_set *restrict rfds, fd_set *restrict wfds, fd_set *restrict
 			((syscall_arg_t[]){ 0, _NSIG/8 }));
 	if (SYS_pselect6 == SYS_pselect6_time64 || r!=-ENOSYS)
 		return __syscall_ret(r);
+	s = CLAMP(s);
 #endif
 #ifdef SYS_select
 	return syscall_cp(SYS_select, n, rfds, wfds, efds,
diff --git a/src/setjmp/aarch64/longjmp.s b/src/setjmp/aarch64/longjmp.s
index 7c4655fa..0af9c50e 100644
--- a/src/setjmp/aarch64/longjmp.s
+++ b/src/setjmp/aarch64/longjmp.s
@@ -18,7 +18,6 @@ longjmp:
 	ldp d12, d13, [x0,#144]
 	ldp d14, d15, [x0,#160]
 
-	mov x0, x1
-	cbnz x1, 1f
-	mov x0, #1
-1:	br x30
+	cmp w1, 0
+	csinc w0, w1, wzr, ne
+	br x30
diff --git a/src/setjmp/i386/longjmp.s b/src/setjmp/i386/longjmp.s
index 772d28dd..8188f06b 100644
--- a/src/setjmp/i386/longjmp.s
+++ b/src/setjmp/i386/longjmp.s
@@ -6,15 +6,11 @@ _longjmp:
 longjmp:
 	mov  4(%esp),%edx
 	mov  8(%esp),%eax
-	test    %eax,%eax
-	jnz 1f
-	inc     %eax
-1:
+	cmp       $1,%eax
+	adc       $0, %al
 	mov   (%edx),%ebx
 	mov  4(%edx),%esi
 	mov  8(%edx),%edi
 	mov 12(%edx),%ebp
-	mov 16(%edx),%ecx
-	mov     %ecx,%esp
-	mov 20(%edx),%ecx
-	jmp *%ecx
+	mov 16(%edx),%esp
+	jmp *20(%edx)
diff --git a/src/setjmp/loongarch64/longjmp.S b/src/setjmp/loongarch64/longjmp.S
new file mode 100644
index 00000000..896d2e26
--- /dev/null
+++ b/src/setjmp/loongarch64/longjmp.S
@@ -0,0 +1,32 @@
+.global _longjmp
+.global longjmp
+.type   _longjmp,@function
+.type   longjmp,@function
+_longjmp:
+longjmp:
+	ld.d    $ra, $a0, 0
+	ld.d    $sp, $a0, 8
+	ld.d    $r21,$a0, 16
+	ld.d    $fp, $a0, 24
+	ld.d    $s0, $a0, 32
+	ld.d    $s1, $a0, 40
+	ld.d    $s2, $a0, 48
+	ld.d    $s3, $a0, 56
+	ld.d    $s4, $a0, 64
+	ld.d    $s5, $a0, 72
+	ld.d    $s6, $a0, 80
+	ld.d    $s7, $a0, 88
+	ld.d    $s8, $a0, 96
+#ifndef __loongarch_soft_float
+	fld.d   $fs0, $a0, 104
+	fld.d   $fs1, $a0, 112
+	fld.d   $fs2, $a0, 120
+	fld.d   $fs3, $a0, 128
+	fld.d   $fs4, $a0, 136
+	fld.d   $fs5, $a0, 144
+	fld.d   $fs6, $a0, 152
+	fld.d   $fs7, $a0, 160
+#endif
+	sltui   $a0, $a1, 1
+	add.d   $a0, $a0, $a1
+	jr      $ra
diff --git a/src/setjmp/loongarch64/setjmp.S b/src/setjmp/loongarch64/setjmp.S
new file mode 100644
index 00000000..d158a3d2
--- /dev/null
+++ b/src/setjmp/loongarch64/setjmp.S
@@ -0,0 +1,34 @@
+.global __setjmp
+.global _setjmp
+.global setjmp
+.type __setjmp,@function
+.type _setjmp,@function
+.type setjmp,@function
+__setjmp:
+_setjmp:
+setjmp:
+	st.d    $ra, $a0, 0
+	st.d    $sp, $a0, 8
+	st.d    $r21,$a0, 16
+	st.d    $fp, $a0, 24
+	st.d    $s0, $a0, 32
+	st.d    $s1, $a0, 40
+	st.d    $s2, $a0, 48
+	st.d    $s3, $a0, 56
+	st.d    $s4, $a0, 64
+	st.d    $s5, $a0, 72
+	st.d    $s6, $a0, 80
+	st.d    $s7, $a0, 88
+	st.d    $s8, $a0, 96
+#ifndef __loongarch_soft_float
+	fst.d   $fs0, $a0, 104
+	fst.d   $fs1, $a0, 112
+	fst.d   $fs2, $a0, 120
+	fst.d   $fs3, $a0, 128
+	fst.d   $fs4, $a0, 136
+	fst.d   $fs5, $a0, 144
+	fst.d   $fs6, $a0, 152
+	fst.d   $fs7, $a0, 160
+#endif
+	move    $a0, $zero
+	jr      $ra
diff --git a/src/setjmp/powerpc/longjmp.S b/src/setjmp/powerpc/longjmp.S
index e598bd05..465e4cd7 100644
--- a/src/setjmp/powerpc/longjmp.S
+++ b/src/setjmp/powerpc/longjmp.S
@@ -37,7 +37,37 @@ longjmp:
 	lwz 29, 72(3)
 	lwz 30, 76(3)
 	lwz 31, 80(3)
-#ifndef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
+	mflr 0
+	bl 1f
+	.hidden __hwcap
+	.long __hwcap-.
+1:	mflr 6
+	lwz 5, 0(6)
+	lwzx 6, 6, 5
+	andis. 6, 6, 0x80
+	beq 1f
+	.long 0x11c35b01 /* evldd 14,88(3) */
+	.long 0x11e36301 /* ... */
+	.long 0x12036b01
+	.long 0x12237301
+	.long 0x12437b01
+	.long 0x12638301
+	.long 0x12838b01
+	.long 0x12a39301
+	.long 0x12c39b01
+	.long 0x12e3a301
+	.long 0x1303ab01
+	.long 0x1323b301
+	.long 0x1343bb01
+	.long 0x1363c301
+	.long 0x1383cb01
+	.long 0x13a3d301
+	.long 0x13c3db01
+	.long 0x13e3e301 /* evldd 31,224(3) */
+	.long 0x11a3eb01 /* evldd 13,232(3) */
+1:	mtlr 0
+#else
 	lfd 14,88(3)
 	lfd 15,96(3)
 	lfd 16,104(3)
diff --git a/src/setjmp/powerpc/setjmp.S b/src/setjmp/powerpc/setjmp.S
index cd91a207..f1fcce33 100644
--- a/src/setjmp/powerpc/setjmp.S
+++ b/src/setjmp/powerpc/setjmp.S
@@ -37,7 +37,37 @@ setjmp:
 	stw 29, 72(3)
 	stw 30, 76(3)
 	stw 31, 80(3)
-#ifndef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
+	mflr 0
+	bl 1f
+	.hidden __hwcap
+	.long __hwcap-.
+1:	mflr 4
+	lwz 5, 0(4)
+	lwzx 4, 4, 5
+	andis. 4, 4, 0x80
+	beq 1f
+	.long 0x11c35b21 /* evstdd 14,88(3) */
+	.long 0x11e36321 /* ... */
+	.long 0x12036b21
+	.long 0x12237321
+	.long 0x12437b21
+	.long 0x12638321
+	.long 0x12838b21
+	.long 0x12a39321
+	.long 0x12c39b21
+	.long 0x12e3a321
+	.long 0x1303ab21
+	.long 0x1323b321
+	.long 0x1343bb21
+	.long 0x1363c321
+	.long 0x1383cb21
+	.long 0x13a3d321
+	.long 0x13c3db21
+	.long 0x13e3e321 /* evstdd 31,224(3) */
+	.long 0x11a3eb21 /* evstdd 13,232(3) */
+1:	mtlr 0
+#else
 	stfd 14,88(3)
 	stfd 15,96(3)
 	stfd 16,104(3)
diff --git a/src/setjmp/riscv32/longjmp.S b/src/setjmp/riscv32/longjmp.S
new file mode 100644
index 00000000..f9cb3318
--- /dev/null
+++ b/src/setjmp/riscv32/longjmp.S
@@ -0,0 +1,42 @@
+.global __longjmp
+.global _longjmp
+.global longjmp
+.type __longjmp, %function
+.type _longjmp,  %function
+.type longjmp,   %function
+__longjmp:
+_longjmp:
+longjmp:
+	lw s0,    0(a0)
+	lw s1,    4(a0)
+	lw s2,    8(a0)
+	lw s3,    12(a0)
+	lw s4,    16(a0)
+	lw s5,    20(a0)
+	lw s6,    24(a0)
+	lw s7,    28(a0)
+	lw s8,    32(a0)
+	lw s9,    36(a0)
+	lw s10,   40(a0)
+	lw s11,   44(a0)
+	lw sp,    48(a0)
+	lw ra,    52(a0)
+
+#ifndef __riscv_float_abi_soft
+	fld fs0,  56(a0)
+	fld fs1,  64(a0)
+	fld fs2,  72(a0)
+	fld fs3,  80(a0)
+	fld fs4,  88(a0)
+	fld fs5,  96(a0)
+	fld fs6,  104(a0)
+	fld fs7,  112(a0)
+	fld fs8,  120(a0)
+	fld fs9,  128(a0)
+	fld fs10, 136(a0)
+	fld fs11, 144(a0)
+#endif
+
+	seqz a0, a1
+	add a0, a0, a1
+	ret
diff --git a/src/setjmp/riscv32/setjmp.S b/src/setjmp/riscv32/setjmp.S
new file mode 100644
index 00000000..8a75cf55
--- /dev/null
+++ b/src/setjmp/riscv32/setjmp.S
@@ -0,0 +1,41 @@
+.global __setjmp
+.global _setjmp
+.global setjmp
+.type __setjmp, %function
+.type _setjmp,  %function
+.type setjmp,   %function
+__setjmp:
+_setjmp:
+setjmp:
+	sw s0,    0(a0)
+	sw s1,    4(a0)
+	sw s2,    8(a0)
+	sw s3,    12(a0)
+	sw s4,    16(a0)
+	sw s5,    20(a0)
+	sw s6,    24(a0)
+	sw s7,    28(a0)
+	sw s8,    32(a0)
+	sw s9,    36(a0)
+	sw s10,   40(a0)
+	sw s11,   44(a0)
+	sw sp,    48(a0)
+	sw ra,    52(a0)
+
+#ifndef __riscv_float_abi_soft
+	fsd fs0,  56(a0)
+	fsd fs1,  64(a0)
+	fsd fs2,  72(a0)
+	fsd fs3,  80(a0)
+	fsd fs4,  88(a0)
+	fsd fs5,  96(a0)
+	fsd fs6,  104(a0)
+	fsd fs7,  112(a0)
+	fsd fs8,  120(a0)
+	fsd fs9,  128(a0)
+	fsd fs10, 136(a0)
+	fsd fs11, 144(a0)
+#endif
+
+	li a0, 0
+	ret
diff --git a/src/setjmp/x32/longjmp.s b/src/setjmp/x32/longjmp.s
index e175a4b9..1b2661c3 100644
--- a/src/setjmp/x32/longjmp.s
+++ b/src/setjmp/x32/longjmp.s
@@ -5,18 +5,14 @@
 .type longjmp,@function
 _longjmp:
 longjmp:
-	mov %rsi,%rax           /* val will be longjmp return */
-	test %rax,%rax
-	jnz 1f
-	inc %rax                /* if val==0, val=1 per longjmp semantics */
-1:
+	xor %eax,%eax
+	cmp $1,%esi             /* CF = val ? 0 : 1 */
+	adc %esi,%eax           /* eax = val + !val */
 	mov (%rdi),%rbx         /* rdi is the jmp_buf, restore regs from it */
 	mov 8(%rdi),%rbp
 	mov 16(%rdi),%r12
 	mov 24(%rdi),%r13
 	mov 32(%rdi),%r14
 	mov 40(%rdi),%r15
-	mov 48(%rdi),%rdx       /* this ends up being the stack pointer */
-	mov %rdx,%rsp
-	mov 56(%rdi),%rdx       /* this is the instruction pointer */
-	jmp *%rdx               /* goto saved address without altering rsp */
+	mov 48(%rdi),%rsp
+	jmp *56(%rdi)           /* goto saved address without altering rsp */
diff --git a/src/setjmp/x32/setjmp.s b/src/setjmp/x32/setjmp.s
index 98f58b8d..d95e4853 100644
--- a/src/setjmp/x32/setjmp.s
+++ b/src/setjmp/x32/setjmp.s
@@ -18,5 +18,5 @@ setjmp:
 	mov %rdx,48(%rdi)
 	mov (%rsp),%rdx         /* save return addr ptr for new rip */
 	mov %rdx,56(%rdi)
-	xor %rax,%rax           /* always return 0 */
+	xor %eax,%eax           /* always return 0 */
 	ret
diff --git a/src/setjmp/x86_64/longjmp.s b/src/setjmp/x86_64/longjmp.s
index e175a4b9..1b2661c3 100644
--- a/src/setjmp/x86_64/longjmp.s
+++ b/src/setjmp/x86_64/longjmp.s
@@ -5,18 +5,14 @@
 .type longjmp,@function
 _longjmp:
 longjmp:
-	mov %rsi,%rax           /* val will be longjmp return */
-	test %rax,%rax
-	jnz 1f
-	inc %rax                /* if val==0, val=1 per longjmp semantics */
-1:
+	xor %eax,%eax
+	cmp $1,%esi             /* CF = val ? 0 : 1 */
+	adc %esi,%eax           /* eax = val + !val */
 	mov (%rdi),%rbx         /* rdi is the jmp_buf, restore regs from it */
 	mov 8(%rdi),%rbp
 	mov 16(%rdi),%r12
 	mov 24(%rdi),%r13
 	mov 32(%rdi),%r14
 	mov 40(%rdi),%r15
-	mov 48(%rdi),%rdx       /* this ends up being the stack pointer */
-	mov %rdx,%rsp
-	mov 56(%rdi),%rdx       /* this is the instruction pointer */
-	jmp *%rdx               /* goto saved address without altering rsp */
+	mov 48(%rdi),%rsp
+	jmp *56(%rdi)           /* goto saved address without altering rsp */
diff --git a/src/setjmp/x86_64/setjmp.s b/src/setjmp/x86_64/setjmp.s
index 98f58b8d..d95e4853 100644
--- a/src/setjmp/x86_64/setjmp.s
+++ b/src/setjmp/x86_64/setjmp.s
@@ -18,5 +18,5 @@ setjmp:
 	mov %rdx,48(%rdi)
 	mov (%rsp),%rdx         /* save return addr ptr for new rip */
 	mov %rdx,56(%rdi)
-	xor %rax,%rax           /* always return 0 */
+	xor %eax,%eax           /* always return 0 */
 	ret
diff --git a/src/signal/block.c b/src/signal/block.c
index d7f61001..cc8698f0 100644
--- a/src/signal/block.c
+++ b/src/signal/block.c
@@ -3,9 +3,9 @@
 #include <signal.h>
 
 static const unsigned long all_mask[] = {
-#if ULONG_MAX == 0xffffffff && _NSIG == 129
+#if ULONG_MAX == 0xffffffff && _NSIG > 65
 	-1UL, -1UL, -1UL, -1UL
-#elif ULONG_MAX == 0xffffffff
+#elif ULONG_MAX == 0xffffffff || _NSIG > 65
 	-1UL, -1UL
 #else
 	-1UL
diff --git a/src/signal/mips/restore.s b/src/signal/loongarch64/restore.s
index b6dadce0..d90a8ebb 100644
--- a/src/signal/mips/restore.s
+++ b/src/signal/loongarch64/restore.s
@@ -1,15 +1,10 @@
-.set noreorder
-
 .global __restore_rt
-.hidden __restore_rt
-.type   __restore_rt,@function
-__restore_rt:
-	li $2, 4193
-	syscall
-
 .global __restore
+.hidden __restore_rt
 .hidden __restore
+.type   __restore_rt,@function
 .type   __restore,@function
+__restore_rt:
 __restore:
-	li $2, 4119
-	syscall
+	li.w    $a7, 139
+	syscall 0
diff --git a/src/signal/loongarch64/sigsetjmp.s b/src/signal/loongarch64/sigsetjmp.s
new file mode 100644
index 00000000..9c0e3ae2
--- /dev/null
+++ b/src/signal/loongarch64/sigsetjmp.s
@@ -0,0 +1,25 @@
+.global sigsetjmp
+.global __sigsetjmp
+.type sigsetjmp,@function
+.type __sigsetjmp,@function
+sigsetjmp:
+__sigsetjmp:
+	beq     $a1, $zero, 1f
+	st.d    $ra, $a0, 184
+	st.d    $s0, $a0, 200  #184+8+8
+	move    $s0, $a0
+
+	la.global  $t0, setjmp
+	jirl       $ra, $t0, 0
+
+	move    $a1, $a0        # Return from 'setjmp' or 'longjmp'
+	move    $a0, $s0
+	ld.d    $ra, $a0, 184
+	ld.d    $s0, $a0, 200 #184+8+8
+
+.hidden __sigsetjmp_tail
+	la.global  $t0, __sigsetjmp_tail
+	jr         $t0
+1:
+	la.global  $t0, setjmp
+	jr         $t0
diff --git a/src/signal/mips64/restore.s b/src/signal/mips64/restore.s
deleted file mode 100644
index 401f8e73..00000000
--- a/src/signal/mips64/restore.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.set	noreorder
-.global	__restore_rt
-.global	__restore
-.hidden __restore_rt
-.hidden __restore
-.type	__restore_rt,@function
-.type	__restore,@function
-__restore_rt:
-__restore:
-	li	$2,5211
-	syscall
diff --git a/src/signal/mipsn32/restore.s b/src/signal/mipsn32/restore.s
deleted file mode 100644
index 4cd4e1b4..00000000
--- a/src/signal/mipsn32/restore.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.set	noreorder
-.global	__restore_rt
-.global	__restore
-.hidden __restore_rt
-.hidden __restore
-.type	__restore_rt,@function
-.type	__restore,@function
-__restore_rt:
-__restore:
-	li	$2,6211
-	syscall
diff --git a/src/signal/riscv32/restore.s b/src/signal/riscv32/restore.s
new file mode 100644
index 00000000..5a0af695
--- /dev/null
+++ b/src/signal/riscv32/restore.s
@@ -0,0 +1,10 @@
+.global __restore
+.hidden __restore
+.type __restore, %function
+__restore:
+.global __restore_rt
+.hidden __restore_rt
+.type __restore_rt, %function
+__restore_rt:
+	li a7, 139 # SYS_rt_sigreturn
+	ecall
diff --git a/src/signal/riscv32/sigsetjmp.s b/src/signal/riscv32/sigsetjmp.s
new file mode 100644
index 00000000..c1caeab1
--- /dev/null
+++ b/src/signal/riscv32/sigsetjmp.s
@@ -0,0 +1,23 @@
+.global sigsetjmp
+.global __sigsetjmp
+.type sigsetjmp, %function
+.type __sigsetjmp, %function
+sigsetjmp:
+__sigsetjmp:
+	bnez a1, 1f
+	tail setjmp
+1:
+
+	sw ra, 152(a0)
+	sw s0, 164(a0)
+	mv s0, a0
+
+	call setjmp
+
+	mv a1, a0
+	mv a0, s0
+	lw s0, 164(a0)
+	lw ra, 152(a0)
+
+.hidden __sigsetjmp_tail
+	tail __sigsetjmp_tail
diff --git a/src/signal/riscv64/restore.s b/src/signal/riscv64/restore.s
index 40012c75..5a0af695 100644
--- a/src/signal/riscv64/restore.s
+++ b/src/signal/riscv64/restore.s
@@ -1,7 +1,9 @@
 .global __restore
+.hidden __restore
 .type __restore, %function
 __restore:
 .global __restore_rt
+.hidden __restore_rt
 .type __restore_rt, %function
 __restore_rt:
 	li a7, 139 # SYS_rt_sigreturn
diff --git a/src/signal/sh/sigsetjmp.s b/src/signal/sh/sigsetjmp.s
index 1e2270be..f0f604e2 100644
--- a/src/signal/sh/sigsetjmp.s
+++ b/src/signal/sh/sigsetjmp.s
@@ -27,7 +27,7 @@ __sigsetjmp:
 
 	mov.l 3f, r0
 4:	braf r0
-	 mov.l @(4+8,r4), r8
+	 mov.l @(4+8,r6), r8
 
 9:	mov.l 5f, r0
 6:	braf r0
diff --git a/src/signal/sigaction.c b/src/signal/sigaction.c
index c109bea0..e45308fa 100644
--- a/src/signal/sigaction.c
+++ b/src/signal/sigaction.c
@@ -7,12 +7,6 @@
 #include "lock.h"
 #include "ksigaction.h"
 
-static volatile int dummy_lock[1] = { 0 };
-
-extern hidden volatile int __abort_lock[1];
-
-weak_alias(dummy_lock, __abort_lock);
-
 static int unmask_done;
 static unsigned long handler_set[_NSIG/(8*sizeof(long))];
 
@@ -26,7 +20,6 @@ volatile int __eintr_valid_flag;
 int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old)
 {
 	struct k_sigaction ksa, ksa_old;
-	unsigned long set[_NSIG/(8*sizeof(long))];
 	if (sa) {
 		if ((uintptr_t)sa->sa_handler > 1UL) {
 			a_or_l(handler_set+(sig-1)/(8*sizeof(long)),
@@ -50,24 +43,15 @@ int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigact
 				a_store(&__eintr_valid_flag, 1);
 			}
 		}
-		/* Changing the disposition of SIGABRT to anything but
-		 * SIG_DFL requires a lock, so that it cannot be changed
-		 * while abort is terminating the process after simply
-		 * calling raise(SIGABRT) failed to do so. */
-		if (sa->sa_handler != SIG_DFL && sig == SIGABRT) {
-			__block_all_sigs(&set);
-			LOCK(__abort_lock);
-		}
 		ksa.handler = sa->sa_handler;
-		ksa.flags = sa->sa_flags | SA_RESTORER;
+		ksa.flags = sa->sa_flags;
+#ifdef SA_RESTORER
+		ksa.flags |= SA_RESTORER;
 		ksa.restorer = (sa->sa_flags & SA_SIGINFO) ? __restore_rt : __restore;
+#endif
 		memcpy(&ksa.mask, &sa->sa_mask, _NSIG/8);
 	}
 	int r = __syscall(SYS_rt_sigaction, sig, sa?&ksa:0, old?&ksa_old:0, _NSIG/8);
-	if (sig == SIGABRT && sa && sa->sa_handler != SIG_DFL) {
-		UNLOCK(__abort_lock);
-		__restore_sigs(&set);
-	}
 	if (old && !r) {
 		old->sa_handler = ksa_old.handler;
 		old->sa_flags = ksa_old.flags;
@@ -78,11 +62,26 @@ int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigact
 
 int __sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old)
 {
+	unsigned long set[_NSIG/(8*sizeof(long))];
+
 	if (sig-32U < 3 || sig-1U >= _NSIG-1) {
 		errno = EINVAL;
 		return -1;
 	}
-	return __libc_sigaction(sig, sa, old);
+
+	/* Doing anything with the disposition of SIGABRT requires a lock,
+	 * so that it cannot be changed while abort is terminating the
+	 * process and so any change made by abort can't be observed. */
+	if (sig == SIGABRT) {
+		__block_all_sigs(&set);
+		LOCK(__abort_lock);
+	}
+	int r = __libc_sigaction(sig, sa, old);
+	if (sig == SIGABRT) {
+		UNLOCK(__abort_lock);
+		__restore_sigs(&set);
+	}
+	return r;
 }
 
 weak_alias(__sigaction, sigaction);
diff --git a/src/signal/sigaltstack.c b/src/signal/sigaltstack.c
index d3a6e821..616625c5 100644
--- a/src/signal/sigaltstack.c
+++ b/src/signal/sigaltstack.c
@@ -1,11 +1,13 @@
 #include <signal.h>
 #include <errno.h>
+#include <unistd.h>
 #include "syscall.h"
 
 int sigaltstack(const stack_t *restrict ss, stack_t *restrict old)
 {
 	if (ss) {
-		if (!(ss->ss_flags & SS_DISABLE) && ss->ss_size < MINSIGSTKSZ) {
+		size_t min = sysconf(_SC_MINSIGSTKSZ);
+		if (!(ss->ss_flags & SS_DISABLE) && ss->ss_size < min) {
 			errno = ENOMEM;
 			return -1;
 		}
diff --git a/src/signal/siglongjmp.c b/src/signal/siglongjmp.c
index bc317acc..53789b23 100644
--- a/src/signal/siglongjmp.c
+++ b/src/signal/siglongjmp.c
@@ -5,5 +5,10 @@
 
 _Noreturn void siglongjmp(sigjmp_buf buf, int ret)
 {
+	/* If sigsetjmp was called with nonzero savemask flag, the address
+	 * longjmp will return to is inside of sigsetjmp. The signal mask
+	 * will then be restored in the returned-to context instead of here,
+	 * which matters if the context we are returning from may not have
+	 * sufficient stack space for signal delivery. */
 	longjmp(buf, ret);
 }
diff --git a/src/signal/sigpause.c b/src/signal/sigpause.c
index 363d2fec..1587c391 100644
--- a/src/signal/sigpause.c
+++ b/src/signal/sigpause.c
@@ -4,6 +4,6 @@ int sigpause(int sig)
 {
 	sigset_t mask;
 	sigprocmask(0, 0, &mask);
-	sigdelset(&mask, sig);
+	if (sigdelset(&mask, sig)) return -1;
 	return sigsuspend(&mask);
 }
diff --git a/src/stat/__xstat.c b/src/stat/__xstat.c
index 630936a0..b4560df7 100644
--- a/src/stat/__xstat.c
+++ b/src/stat/__xstat.c
@@ -22,11 +22,6 @@ int __xstat(int ver, const char *path, struct stat *buf)
 	return stat(path, buf);
 }
 
-weak_alias(__fxstat, __fxstat64);
-weak_alias(__fxstatat, __fxstatat64);
-weak_alias(__lxstat, __lxstat64);
-weak_alias(__xstat, __xstat64);
-
 #endif
 
 int __xmknod(int ver, const char *path, mode_t mode, dev_t *dev)
diff --git a/src/stat/fchmodat.c b/src/stat/fchmodat.c
index be61bdf3..92c9d1b0 100644
--- a/src/stat/fchmodat.c
+++ b/src/stat/fchmodat.c
@@ -5,17 +5,20 @@
 
 int fchmodat(int fd, const char *path, mode_t mode, int flag)
 {
-	if (!flag) return syscall(SYS_fchmodat, fd, path, mode, flag);
+	if (!flag) return syscall(SYS_fchmodat, fd, path, mode);
+
+	int ret = __syscall(SYS_fchmodat2, fd, path, mode, flag);
+	if (ret != -ENOSYS) return __syscall_ret(ret);
 
 	if (flag != AT_SYMLINK_NOFOLLOW)
 		return __syscall_ret(-EINVAL);
 
 	struct stat st;
-	int ret, fd2;
+	int fd2;
 	char proc[15+3*sizeof(int)];
 
-	if ((ret = __syscall(SYS_fstatat, fd, path, &st, flag)))
-		return __syscall_ret(ret);
+	if (fstatat(fd, path, &st, flag))
+		return -1;
 	if (S_ISLNK(st.st_mode))
 		return __syscall_ret(-EOPNOTSUPP);
 
@@ -26,12 +29,12 @@ int fchmodat(int fd, const char *path, mode_t mode, int flag)
 	}
 
 	__procfdname(proc, fd2);
-	ret = __syscall(SYS_fstatat, AT_FDCWD, proc, &st, 0);
+	ret = stat(proc, &st);
 	if (!ret) {
-		if (S_ISLNK(st.st_mode)) ret = -EOPNOTSUPP;
-		else ret = __syscall(SYS_fchmodat, AT_FDCWD, proc, mode);
+		if (S_ISLNK(st.st_mode)) ret = __syscall_ret(-EOPNOTSUPP);
+		else ret = syscall(SYS_fchmodat, AT_FDCWD, proc, mode);
 	}
 
 	__syscall(SYS_close, fd2);
-	return __syscall_ret(ret);
+	return ret;
 }
diff --git a/src/stat/fstat.c b/src/stat/fstat.c
index 9bbb46de..fd28b8ac 100644
--- a/src/stat/fstat.c
+++ b/src/stat/fstat.c
@@ -4,12 +4,10 @@
 #include <fcntl.h>
 #include "syscall.h"
 
-int fstat(int fd, struct stat *st)
+int __fstat(int fd, struct stat *st)
 {
 	if (fd<0) return __syscall_ret(-EBADF);
-	return fstatat(fd, "", st, AT_EMPTY_PATH);
+	return __fstatat(fd, "", st, AT_EMPTY_PATH);
 }
 
-#if !_REDIR_TIME64
-weak_alias(fstat, fstat64);
-#endif
+weak_alias(__fstat, fstat);
diff --git a/src/stat/fstatat.c b/src/stat/fstatat.c
index de165b5c..9eed063b 100644
--- a/src/stat/fstatat.c
+++ b/src/stat/fstatat.c
@@ -6,7 +6,6 @@
 #include <stdint.h>
 #include <sys/sysmacros.h>
 #include "syscall.h"
-#include "kstat.h"
 
 struct statx {
 	uint32_t stx_mask;
@@ -37,6 +36,7 @@ static int fstatat_statx(int fd, const char *restrict path, struct stat *restric
 {
 	struct statx stx;
 
+	flag |= AT_NO_AUTOMOUNT;
 	int ret = __syscall(SYS_statx, fd, path, flag, 0x7ff, &stx);
 	if (ret) return ret;
 
@@ -69,6 +69,10 @@ static int fstatat_statx(int fd, const char *restrict path, struct stat *restric
 	return 0;
 }
 
+#ifdef SYS_fstatat
+
+#include "kstat.h"
+
 static int fstatat_kstat(int fd, const char *restrict path, struct stat *restrict st, int flag)
 {
 	int ret;
@@ -130,18 +134,21 @@ static int fstatat_kstat(int fd, const char *restrict path, struct stat *restric
 
 	return 0;
 }
+#endif
 
-int fstatat(int fd, const char *restrict path, struct stat *restrict st, int flag)
+int __fstatat(int fd, const char *restrict path, struct stat *restrict st, int flag)
 {
 	int ret;
+#ifdef SYS_fstatat
 	if (sizeof((struct kstat){0}.st_atime_sec) < sizeof(time_t)) {
 		ret = fstatat_statx(fd, path, st, flag);
 		if (ret!=-ENOSYS) return __syscall_ret(ret);
 	}
 	ret = fstatat_kstat(fd, path, st, flag);
+#else
+	ret = fstatat_statx(fd, path, st, flag);
+#endif
 	return __syscall_ret(ret);
 }
 
-#if !_REDIR_TIME64
-weak_alias(fstatat, fstatat64);
-#endif
+weak_alias(__fstatat, fstatat);
diff --git a/src/stat/lstat.c b/src/stat/lstat.c
index 6fe004de..6822fcae 100644
--- a/src/stat/lstat.c
+++ b/src/stat/lstat.c
@@ -5,7 +5,3 @@ int lstat(const char *restrict path, struct stat *restrict buf)
 {
 	return fstatat(AT_FDCWD, path, buf, AT_SYMLINK_NOFOLLOW);
 }
-
-#if !_REDIR_TIME64
-weak_alias(lstat, lstat64);
-#endif
diff --git a/src/stat/stat.c b/src/stat/stat.c
index ea70efc4..23570e7a 100644
--- a/src/stat/stat.c
+++ b/src/stat/stat.c
@@ -5,7 +5,3 @@ int stat(const char *restrict path, struct stat *restrict buf)
 {
 	return fstatat(AT_FDCWD, path, buf, 0);
 }
-
-#if !_REDIR_TIME64
-weak_alias(stat, stat64);
-#endif
diff --git a/src/stat/statvfs.c b/src/stat/statvfs.c
index f65d1b54..bc12da8b 100644
--- a/src/stat/statvfs.c
+++ b/src/stat/statvfs.c
@@ -39,6 +39,7 @@ static void fixup(struct statvfs *out, const struct statfs *in)
 	out->f_fsid = in->f_fsid.__val[0];
 	out->f_flag = in->f_flags;
 	out->f_namemax = in->f_namelen;
+	out->f_type = in->f_type;
 }
 
 int statvfs(const char *restrict path, struct statvfs *restrict buf)
@@ -56,8 +57,3 @@ int fstatvfs(int fd, struct statvfs *buf)
 	fixup(buf, &kbuf);
 	return 0;
 }
-
-weak_alias(statvfs, statvfs64);
-weak_alias(statfs, statfs64);
-weak_alias(fstatvfs, fstatvfs64);
-weak_alias(fstatfs, fstatfs64);
diff --git a/src/stdio/__stdio_close.c b/src/stdio/__stdio_close.c
index 79452bdb..30291328 100644
--- a/src/stdio/__stdio_close.c
+++ b/src/stdio/__stdio_close.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "aio_impl.h"
 
 static int dummy(int fd)
 {
diff --git a/src/stdio/__stdio_write.c b/src/stdio/__stdio_write.c
index d2d89475..5356553d 100644
--- a/src/stdio/__stdio_write.c
+++ b/src/stdio/__stdio_write.c
@@ -11,6 +11,11 @@ size_t __stdio_write(FILE *f, const unsigned char *buf, size_t len)
 	size_t rem = iov[0].iov_len + iov[1].iov_len;
 	int iovcnt = 2;
 	ssize_t cnt;
+
+	if (!iov->iov_len) {
+		iov++;
+		iovcnt--;
+	}
 	for (;;) {
 		cnt = syscall(SYS_writev, f->fd, iov, iovcnt);
 		if (cnt == rem) {
diff --git a/src/stdio/__string_read.c b/src/stdio/__string_read.c
deleted file mode 100644
index 7b50a7e1..00000000
--- a/src/stdio/__string_read.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "stdio_impl.h"
-#include <string.h>
-
-size_t __string_read(FILE *f, unsigned char *buf, size_t len)
-{
-	char *src = f->cookie;
-	size_t k = len+256;
-	char *end = memchr(src, 0, k);
-	if (end) k = end-src;
-	if (k < len) len = k;
-	memcpy(buf, src, len);
-	f->rpos = (void *)(src+len);
-	f->rend = (void *)(src+k);
-	f->cookie = src+k;
-	return len;
-}
diff --git a/src/stdio/fgetpos.c b/src/stdio/fgetpos.c
index 50813d2c..392f7323 100644
--- a/src/stdio/fgetpos.c
+++ b/src/stdio/fgetpos.c
@@ -7,5 +7,3 @@ int fgetpos(FILE *restrict f, fpos_t *restrict pos)
 	*(long long *)pos = off;
 	return 0;
 }
-
-weak_alias(fgetpos, fgetpos64);
diff --git a/src/stdio/fgets.c b/src/stdio/fgets.c
index 6171f398..4a100b39 100644
--- a/src/stdio/fgets.c
+++ b/src/stdio/fgets.c
@@ -12,13 +12,14 @@ char *fgets(char *restrict s, int n, FILE *restrict f)
 
 	FLOCK(f);
 
-	if (n--<=1) {
+	if (n<=1) {
 		f->mode |= f->mode-1;
 		FUNLOCK(f);
-		if (n) return 0;
+		if (n<1) return 0;
 		*s = 0;
 		return s;
 	}
+	n--;
 
 	while (n) {
 		if (f->rpos != f->rend) {
diff --git a/src/stdio/fgetws.c b/src/stdio/fgetws.c
index b08b3049..195cb435 100644
--- a/src/stdio/fgetws.c
+++ b/src/stdio/fgetws.c
@@ -1,6 +1,5 @@
 #include "stdio_impl.h"
 #include <wchar.h>
-#include <errno.h>
 
 wint_t __fgetwc_unlocked(FILE *);
 
@@ -12,10 +11,6 @@ wchar_t *fgetws(wchar_t *restrict s, int n, FILE *restrict f)
 
 	FLOCK(f);
 
-	/* Setup a dummy errno so we can detect EILSEQ. This is
-	 * the only way to catch encoding errors in the form of a
-	 * partial character just before EOF. */
-	errno = EAGAIN;
 	for (; n; n--) {
 		wint_t c = __fgetwc_unlocked(f);
 		if (c == WEOF) break;
@@ -23,7 +18,7 @@ wchar_t *fgetws(wchar_t *restrict s, int n, FILE *restrict f)
 		if (c == '\n') break;
 	}
 	*p = 0;
-	if (ferror(f) || errno==EILSEQ) p = s;
+	if (ferror(f)) p = s;
 
 	FUNLOCK(f);
 
diff --git a/src/stdio/fmemopen.c b/src/stdio/fmemopen.c
index 5685092e..343e3e3f 100644
--- a/src/stdio/fmemopen.c
+++ b/src/stdio/fmemopen.c
@@ -2,6 +2,7 @@
 #include <errno.h>
 #include <string.h>
 #include <stdlib.h>
+#include <stddef.h>
 #include <inttypes.h>
 #include "libc.h"
 
@@ -95,18 +96,17 @@ FILE *fmemopen(void *restrict buf, size_t size, const char *restrict mode)
 
 	f = malloc(sizeof *f + (buf?0:size));
 	if (!f) return 0;
-	memset(&f->f, 0, sizeof f->f);
+	memset(f, 0, offsetof(struct mem_FILE, buf));
 	f->f.cookie = &f->c;
 	f->f.fd = -1;
 	f->f.lbf = EOF;
 	f->f.buf = f->buf + UNGET;
 	f->f.buf_size = sizeof f->buf - UNGET;
 	if (!buf) {
-		buf = f->buf2;;
+		buf = f->buf2;
 		memset(buf, 0, size);
 	}
 
-	memset(&f->c, 0, sizeof f->c);
 	f->c.buf = buf;
 	f->c.size = size;
 	f->c.mode = *mode;
diff --git a/src/stdio/fopen.c b/src/stdio/fopen.c
index e1b91e12..80bc341e 100644
--- a/src/stdio/fopen.c
+++ b/src/stdio/fopen.c
@@ -29,5 +29,3 @@ FILE *fopen(const char *restrict filename, const char *restrict mode)
 	__syscall(SYS_close, fd);
 	return 0;
 }
-
-weak_alias(fopen, fopen64);
diff --git a/src/stdio/freopen.c b/src/stdio/freopen.c
index 615d4b47..1641a4c5 100644
--- a/src/stdio/freopen.c
+++ b/src/stdio/freopen.c
@@ -40,6 +40,8 @@ FILE *freopen(const char *restrict filename, const char *restrict mode, FILE *re
 		fclose(f2);
 	}
 
+	f->mode = 0;
+	f->locale = 0;
 	FUNLOCK(f);
 	return f;
 
@@ -49,5 +51,3 @@ fail:
 	fclose(f);
 	return NULL;
 }
-
-weak_alias(freopen, freopen64);
diff --git a/src/stdio/fseek.c b/src/stdio/fseek.c
index 439308f7..c7425802 100644
--- a/src/stdio/fseek.c
+++ b/src/stdio/fseek.c
@@ -1,7 +1,14 @@
 #include "stdio_impl.h"
+#include <errno.h>
 
 int __fseeko_unlocked(FILE *f, off_t off, int whence)
 {
+	/* Fail immediately for invalid whence argument. */
+	if (whence != SEEK_CUR && whence != SEEK_SET && whence != SEEK_END) {
+		errno = EINVAL;
+		return -1;
+	}
+
 	/* Adjust relative offset for unread data in buffer, if any. */
 	if (whence == SEEK_CUR && f->rend) off -= f->rend - f->rpos;
 
@@ -39,5 +46,3 @@ int fseek(FILE *f, long off, int whence)
 }
 
 weak_alias(__fseeko, fseeko);
-
-weak_alias(fseeko, fseeko64);
diff --git a/src/stdio/fsetpos.c b/src/stdio/fsetpos.c
index 77ab8d82..779cb3cc 100644
--- a/src/stdio/fsetpos.c
+++ b/src/stdio/fsetpos.c
@@ -4,5 +4,3 @@ int fsetpos(FILE *f, const fpos_t *pos)
 {
 	return __fseeko(f, *(const long long *)pos, SEEK_SET);
 }
-
-weak_alias(fsetpos, fsetpos64);
diff --git a/src/stdio/ftell.c b/src/stdio/ftell.c
index 1a2afbbc..1e1a08d8 100644
--- a/src/stdio/ftell.c
+++ b/src/stdio/ftell.c
@@ -37,5 +37,3 @@ long ftell(FILE *f)
 }
 
 weak_alias(__ftello, ftello);
-
-weak_alias(ftello, ftello64);
diff --git a/src/stdio/getdelim.c b/src/stdio/getdelim.c
index d2f5b15a..df114441 100644
--- a/src/stdio/getdelim.c
+++ b/src/stdio/getdelim.c
@@ -55,9 +55,11 @@ ssize_t getdelim(char **restrict s, size_t *restrict n, int delim, FILE *restric
 			*s = tmp;
 			*n = m;
 		}
-		memcpy(*s+i, f->rpos, k);
-		f->rpos += k;
-		i += k;
+		if (k) {
+			memcpy(*s+i, f->rpos, k);
+			f->rpos += k;
+			i += k;
+		}
 		if (z) break;
 		if ((c = getc_unlocked(f)) == EOF) {
 			if (!i || !feof(f)) {
diff --git a/src/stdio/ofl.c b/src/stdio/ofl.c
index f2d3215a..aad3d171 100644
--- a/src/stdio/ofl.c
+++ b/src/stdio/ofl.c
@@ -1,8 +1,10 @@
 #include "stdio_impl.h"
 #include "lock.h"
+#include "fork_impl.h"
 
 static FILE *ofl_head;
 static volatile int ofl_lock[1];
+volatile int *const __stdio_ofl_lockptr = ofl_lock;
 
 FILE **__ofl_lock()
 {
diff --git a/src/stdio/open_wmemstream.c b/src/stdio/open_wmemstream.c
index ed1b561d..b8ae4a79 100644
--- a/src/stdio/open_wmemstream.c
+++ b/src/stdio/open_wmemstream.c
@@ -40,8 +40,12 @@ fail:
 static size_t wms_write(FILE *f, const unsigned char *buf, size_t len)
 {
 	struct cookie *c = f->cookie;
-	size_t len2;
+	size_t len2 = f->wpos - f->wbase;
 	wchar_t *newbuf;
+	if (len2) {
+		f->wpos = f->wbase;
+		if (wms_write(f, f->wbase, len2) < len2) return 0;
+	}
 	if (len + c->pos >= c->space) {
 		len2 = 2*c->space+1 | c->pos+len+1;
 		if (len2 > SSIZE_MAX/4) return 0;
diff --git a/src/stdio/pclose.c b/src/stdio/pclose.c
index 080a4262..c64da405 100644
--- a/src/stdio/pclose.c
+++ b/src/stdio/pclose.c
@@ -7,7 +7,7 @@ int pclose(FILE *f)
 	int status, r;
 	pid_t pid = f->pipe_pid;
 	fclose(f);
-	while ((r=__syscall(SYS_wait4, pid, &status, 0, 0)) == -EINTR);
+	while ((r=__sys_wait4(pid, &status, 0, 0)) == -EINTR);
 	if (r<0) return __syscall_ret(r);
 	return status;
 }
diff --git a/src/stdio/popen.c b/src/stdio/popen.c
index 92cb57ee..3ec83394 100644
--- a/src/stdio/popen.c
+++ b/src/stdio/popen.c
@@ -31,25 +31,12 @@ FILE *popen(const char *cmd, const char *mode)
 		__syscall(SYS_close, p[1]);
 		return NULL;
 	}
-	FLOCK(f);
-
-	/* If the child's end of the pipe happens to already be on the final
-	 * fd number to which it will be assigned (either 0 or 1), it must
-	 * be moved to a different fd. Otherwise, there is no safe way to
-	 * remove the close-on-exec flag in the child without also creating
-	 * a file descriptor leak race condition in the parent. */
-	if (p[1-op] == 1-op) {
-		int tmp = fcntl(1-op, F_DUPFD_CLOEXEC, 0);
-		if (tmp < 0) {
-			e = errno;
-			goto fail;
-		}
-		__syscall(SYS_close, p[1-op]);
-		p[1-op] = tmp;
-	}
 
 	e = ENOMEM;
 	if (!posix_spawn_file_actions_init(&fa)) {
+		for (FILE *l = *__ofl_lock(); l; l=l->next)
+			if (l->pipe_pid && posix_spawn_file_actions_addclose(&fa, l->fd))
+				goto fail;
 		if (!posix_spawn_file_actions_adddup2(&fa, p[1-op], 1-op)) {
 			if (!(e = posix_spawn(&pid, "/bin/sh", &fa, 0,
 			    (char *[]){ "sh", "-c", (char *)cmd, 0 }, __environ))) {
@@ -58,13 +45,14 @@ FILE *popen(const char *cmd, const char *mode)
 				if (!strchr(mode, 'e'))
 					fcntl(p[op], F_SETFD, 0);
 				__syscall(SYS_close, p[1-op]);
-				FUNLOCK(f);
+				__ofl_unlock();
 				return f;
 			}
 		}
+fail:
+		__ofl_unlock();
 		posix_spawn_file_actions_destroy(&fa);
 	}
-fail:
 	fclose(f);
 	__syscall(SYS_close, p[1-op]);
 
diff --git a/src/stdio/tempnam.c b/src/stdio/tempnam.c
index 84f91978..0c65b1f0 100644
--- a/src/stdio/tempnam.c
+++ b/src/stdio/tempnam.c
@@ -36,11 +36,10 @@ char *tempnam(const char *dir, const char *pfx)
 
 	for (try=0; try<MAXTRIES; try++) {
 		__randname(s+l-6);
-#ifdef SYS_lstat
-		r = __syscall(SYS_lstat, s, &(struct stat){0});
+#ifdef SYS_readlink
+		r = __syscall(SYS_readlink, s, (char[1]){0}, 1);
 #else
-		r = __syscall(SYS_fstatat, AT_FDCWD, s,
-			&(struct stat){0}, AT_SYMLINK_NOFOLLOW);
+		r = __syscall(SYS_readlinkat, AT_FDCWD, s, (char[1]){0}, 1);
 #endif
 		if (r == -ENOENT) return strdup(s);
 	}
diff --git a/src/stdio/tmpfile.c b/src/stdio/tmpfile.c
index ae493987..2fa8803f 100644
--- a/src/stdio/tmpfile.c
+++ b/src/stdio/tmpfile.c
@@ -27,5 +27,3 @@ FILE *tmpfile(void)
 	}
 	return 0;
 }
-
-weak_alias(tmpfile, tmpfile64);
diff --git a/src/stdio/tmpnam.c b/src/stdio/tmpnam.c
index 6c7c253a..71dc8bb1 100644
--- a/src/stdio/tmpnam.c
+++ b/src/stdio/tmpnam.c
@@ -16,11 +16,10 @@ char *tmpnam(char *buf)
 	int r;
 	for (try=0; try<MAXTRIES; try++) {
 		__randname(s+12);
-#ifdef SYS_lstat
-		r = __syscall(SYS_lstat, s, &(struct stat){0});
+#ifdef SYS_readlink
+		r = __syscall(SYS_readlink, s, (char[1]){0}, 1);
 #else
-		r = __syscall(SYS_fstatat, AT_FDCWD, s,
-			&(struct stat){0}, AT_SYMLINK_NOFOLLOW);
+		r = __syscall(SYS_readlinkat, AT_FDCWD, s, (char[1]){0}, 1);
 #endif
 		if (r == -ENOENT) return strcpy(buf ? buf : internal, s);
 	}
diff --git a/src/stdio/vdprintf.c b/src/stdio/vdprintf.c
index c35d9b4f..3b9c093b 100644
--- a/src/stdio/vdprintf.c
+++ b/src/stdio/vdprintf.c
@@ -1,14 +1,9 @@
 #include "stdio_impl.h"
 
-static size_t wrap_write(FILE *f, const unsigned char *buf, size_t len)
-{
-	return __stdio_write(f, buf, len);
-}
-
 int vdprintf(int fd, const char *restrict fmt, va_list ap)
 {
 	FILE f = {
-		.fd = fd, .lbf = EOF, .write = wrap_write,
+		.fd = fd, .lbf = EOF, .write = __stdio_write,
 		.buf = (void *)fmt, .buf_size = 0,
 		.lock = -1
 	};
diff --git a/src/stdio/vfprintf.c b/src/stdio/vfprintf.c
index 9b961e7f..76733997 100644
--- a/src/stdio/vfprintf.c
+++ b/src/stdio/vfprintf.c
@@ -52,7 +52,7 @@ static const unsigned char states[]['z'-'A'+1] = {
 		S('o') = UINT, S('u') = UINT, S('x') = UINT, S('X') = UINT,
 		S('e') = DBL, S('f') = DBL, S('g') = DBL, S('a') = DBL,
 		S('E') = DBL, S('F') = DBL, S('G') = DBL, S('A') = DBL,
-		S('c') = CHAR, S('C') = INT,
+		S('c') = INT, S('C') = UINT,
 		S('s') = PTR, S('S') = PTR, S('p') = UIPTR, S('n') = PTR,
 		S('m') = NOARG,
 		S('l') = LPRE, S('h') = HPRE, S('L') = BIGLPRE,
@@ -62,7 +62,7 @@ static const unsigned char states[]['z'-'A'+1] = {
 		S('o') = ULONG, S('u') = ULONG, S('x') = ULONG, S('X') = ULONG,
 		S('e') = DBL, S('f') = DBL, S('g') = DBL, S('a') = DBL,
 		S('E') = DBL, S('F') = DBL, S('G') = DBL, S('A') = DBL,
-		S('c') = INT, S('s') = PTR, S('n') = PTR,
+		S('c') = UINT, S('s') = PTR, S('n') = PTR,
 		S('l') = LLPRE,
 	}, { /* 2: ll-prefixed */
 		S('d') = LLONG, S('i') = LLONG,
@@ -132,7 +132,7 @@ static void pop_arg(union arg *arg, int type, va_list *ap)
 
 static void out(FILE *f, const char *s, size_t l)
 {
-	if (!(f->flags & F_ERR)) __fwritex((void *)s, l, f);
+	if (!ferror(f)) __fwritex((void *)s, l, f);
 }
 
 static void pad(FILE *f, char c, int w, int l, int fl)
@@ -166,7 +166,8 @@ static char *fmt_u(uintmax_t x, char *s)
 {
 	unsigned long y;
 	for (   ; x>ULONG_MAX; x/=10) *--s = '0' + x%10;
-	for (y=x;           y; y/=10) *--s = '0' + y%10;
+	for (y=x;       y>=10; y/=10) *--s = '0' + y%10;
+	if (y) *--s = '0' + y;
 	return s;
 }
 
@@ -177,10 +178,14 @@ static char *fmt_u(uintmax_t x, char *s)
 typedef char compiler_defines_long_double_incorrectly[9-(int)sizeof(long double)];
 #endif
 
-static int fmt_fp(FILE *f, long double y, int w, int p, int fl, int t)
+static int fmt_fp(FILE *f, long double y, int w, int p, int fl, int t, int ps)
 {
-	uint32_t big[(LDBL_MANT_DIG+28)/29 + 1          // mantissa expansion
-		+ (LDBL_MAX_EXP+LDBL_MANT_DIG+28+8)/9]; // exponent expansion
+	int bufsize = (ps==BIGLPRE)
+		? (LDBL_MANT_DIG+28)/29 + 1 +          // mantissa expansion
+		  (LDBL_MAX_EXP+LDBL_MANT_DIG+28+8)/9  // exponent expansion
+		: (DBL_MANT_DIG+28)/29 + 1 +
+		  (DBL_MAX_EXP+DBL_MANT_DIG+28+8)/9;
+	uint32_t big[bufsize];
 	uint32_t *a, *d, *r, *z;
 	int e2=0, e, i, j, l;
 	char buf[9+LDBL_MANT_DIG/4], *s;
@@ -211,18 +216,11 @@ static int fmt_fp(FILE *f, long double y, int w, int p, int fl, int t)
 	if (y) e2--;
 
 	if ((t|32)=='a') {
-		long double round = 8.0;
-		int re;
-
 		if (t&32) prefix += 9;
 		pl += 2;
 
-		if (p<0 || p>=LDBL_MANT_DIG/4-1) re=0;
-		else re=LDBL_MANT_DIG/4-1-p;
-
-		if (re) {
-			round *= 1<<(LDBL_MANT_DIG%4);
-			while (re--) round*=16;
+		if (p>=0 && p<(LDBL_MANT_DIG-1+3)/4) {
+			double round = scalbn(1, LDBL_MANT_DIG-1-(p*4));
 			if (*prefix=='-') {
 				y=-y;
 				y-=round;
@@ -437,7 +435,7 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 	unsigned st, ps;
 	int cnt=0, l=0;
 	size_t i;
-	char buf[sizeof(uintmax_t)*3+3+LDBL_MANT_DIG/4];
+	char buf[sizeof(uintmax_t)*3];
 	const char *prefix;
 	int t, pl;
 	wchar_t wc[2], *ws;
@@ -478,8 +476,8 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 		if (*s=='*') {
 			if (isdigit(s[1]) && s[2]=='$') {
 				l10n=1;
-				nl_type[s[1]-'0'] = INT;
-				w = nl_arg[s[1]-'0'].i;
+				if (!f) nl_type[s[1]-'0'] = INT, w = 0;
+				else w = nl_arg[s[1]-'0'].i;
 				s+=3;
 			} else if (!l10n) {
 				w = f ? va_arg(*ap, int) : 0;
@@ -491,8 +489,8 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 		/* Read precision */
 		if (*s=='.' && s[1]=='*') {
 			if (isdigit(s[2]) && s[3]=='$') {
-				nl_type[s[2]-'0'] = INT;
-				p = nl_arg[s[2]-'0'].i;
+				if (!f) nl_type[s[2]-'0'] = INT, p = 0;
+				else p = nl_arg[s[2]-'0'].i;
 				s+=4;
 			} else if (!l10n) {
 				p = f ? va_arg(*ap, int) : 0;
@@ -521,13 +519,18 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 		if (st==NOARG) {
 			if (argpos>=0) goto inval;
 		} else {
-			if (argpos>=0) nl_type[argpos]=st, arg=nl_arg[argpos];
-			else if (f) pop_arg(&arg, st, ap);
+			if (argpos>=0) {
+				if (!f) nl_type[argpos]=st;
+				else arg=nl_arg[argpos];
+			} else if (f) pop_arg(&arg, st, ap);
 			else return 0;
 		}
 
 		if (!f) continue;
 
+		/* Do not process any new directives once in error state. */
+		if (ferror(f)) return -1;
+
 		z = buf + sizeof(buf);
 		prefix = "-+   0X0x";
 		pl = 0;
@@ -558,11 +561,11 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 		case 'x': case 'X':
 			a = fmt_x(arg.i, z, t&32);
 			if (arg.i && (fl & ALT_FORM)) prefix+=(t>>4), pl=2;
-			if (0) {
+			goto ifmt_tail;
 		case 'o':
 			a = fmt_o(arg.i, z);
 			if ((fl&ALT_FORM) && p<z-a+1) p=z-a+1;
-			} if (0) {
+			goto ifmt_tail;
 		case 'd': case 'i':
 			pl=1;
 			if (arg.i>INTMAX_MAX) {
@@ -574,7 +577,7 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 			} else pl=0;
 		case 'u':
 			a = fmt_u(arg.i, z);
-			}
+		ifmt_tail:
 			if (xp && p<0) goto overflow;
 			if (xp) fl &= ~ZERO_PAD;
 			if (!arg.i && !p) {
@@ -583,6 +586,7 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 			}
 			p = MAX(p, z-a + !arg.i);
 			break;
+		narrow_c:
 		case 'c':
 			*(a=z-(p=1))=arg.i;
 			fl &= ~ZERO_PAD;
@@ -597,6 +601,7 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 			fl &= ~ZERO_PAD;
 			break;
 		case 'C':
+			if (!arg.i) goto narrow_c;
 			wc[0] = arg.i;
 			wc[1] = 0;
 			arg.p = wc;
@@ -617,7 +622,7 @@ static int printf_core(FILE *f, const char *fmt, va_list *ap, union arg *nl_arg,
 		case 'e': case 'f': case 'g': case 'a':
 		case 'E': case 'F': case 'G': case 'A':
 			if (xp && p<0) goto overflow;
-			l = fmt_fp(f, arg.f, w, p, fl, t);
+			l = fmt_fp(f, arg.f, w, p, fl, t, ps);
 			if (l<0) goto overflow;
 			continue;
 		}
@@ -672,7 +677,7 @@ int vfprintf(FILE *restrict f, const char *restrict fmt, va_list ap)
 
 	FLOCK(f);
 	olderr = f->flags & F_ERR;
-	if (f->mode < 1) f->flags &= ~F_ERR;
+	f->flags &= ~F_ERR;
 	if (!f->buf_size) {
 		saved_buf = f->buf;
 		f->buf = internal_buf;
@@ -688,7 +693,7 @@ int vfprintf(FILE *restrict f, const char *restrict fmt, va_list ap)
 		f->buf_size = 0;
 		f->wpos = f->wbase = f->wend = 0;
 	}
-	if (f->flags & F_ERR) ret = -1;
+	if (ferror(f)) ret = -1;
 	f->flags |= olderr;
 	FUNLOCK(f);
 	va_end(ap2);
diff --git a/src/stdio/vfscanf.c b/src/stdio/vfscanf.c
index 9e030fc4..b78a374d 100644
--- a/src/stdio/vfscanf.c
+++ b/src/stdio/vfscanf.c
@@ -57,7 +57,7 @@ int vfscanf(FILE *restrict f, const char *restrict fmt, va_list ap)
 {
 	int width;
 	int size;
-	int alloc;
+	int alloc = 0;
 	int base;
 	const unsigned char *p;
 	int c, t;
@@ -76,6 +76,9 @@ int vfscanf(FILE *restrict f, const char *restrict fmt, va_list ap)
 
 	FLOCK(f);
 
+	if (!f->rpos) __toread(f);
+	if (!f->rpos) goto input_fail;
+
 	for (p=(const unsigned char *)fmt; *p; p++) {
 
 		alloc = 0;
diff --git a/src/stdio/vfwprintf.c b/src/stdio/vfwprintf.c
index 85b036c3..59d5471b 100644
--- a/src/stdio/vfwprintf.c
+++ b/src/stdio/vfwprintf.c
@@ -45,7 +45,7 @@ static const unsigned char states[]['z'-'A'+1] = {
 		S('o') = UINT, S('u') = UINT, S('x') = UINT, S('X') = UINT,
 		S('e') = DBL, S('f') = DBL, S('g') = DBL, S('a') = DBL,
 		S('E') = DBL, S('F') = DBL, S('G') = DBL, S('A') = DBL,
-		S('c') = CHAR, S('C') = INT,
+		S('c') = INT, S('C') = UINT,
 		S('s') = PTR, S('S') = PTR, S('p') = UIPTR, S('n') = PTR,
 		S('m') = NOARG,
 		S('l') = LPRE, S('h') = HPRE, S('L') = BIGLPRE,
@@ -55,7 +55,7 @@ static const unsigned char states[]['z'-'A'+1] = {
 		S('o') = ULONG, S('u') = ULONG, S('x') = ULONG, S('X') = ULONG,
 		S('e') = DBL, S('f') = DBL, S('g') = DBL, S('a') = DBL,
 		S('E') = DBL, S('F') = DBL, S('G') = DBL, S('A') = DBL,
-		S('c') = INT, S('s') = PTR, S('n') = PTR,
+		S('c') = UINT, S('s') = PTR, S('n') = PTR,
 		S('l') = LLPRE,
 	}, { /* 2: ll-prefixed */
 		S('d') = LLONG, S('i') = LLONG,
@@ -125,7 +125,13 @@ static void pop_arg(union arg *arg, int type, va_list *ap)
 
 static void out(FILE *f, const wchar_t *s, size_t l)
 {
-	while (l-- && !(f->flags & F_ERR)) fputwc(*s++, f);
+	while (l-- && !ferror(f)) fputwc(*s++, f);
+}
+
+static void pad(FILE *f, int n, int fl)
+{
+	if ((fl & LEFT_ADJ) || !n || ferror(f)) return;
+	fprintf(f, "%*s", n, "");
 }
 
 static int getint(wchar_t **s) {
@@ -242,6 +248,10 @@ static int wprintf_core(FILE *f, const wchar_t *fmt, va_list *ap, union arg *nl_
 		}
 
 		if (!f) continue;
+
+		/* Do not process any new directives once in error state. */
+		if (ferror(f)) return -1;
+
 		t = s[-1];
 		if (ps && (t&15)==3) t&=~32;
 
@@ -258,25 +268,22 @@ static int wprintf_core(FILE *f, const wchar_t *fmt, va_list *ap, union arg *nl_
 			}
 			continue;
 		case 'c':
+		case 'C':
 			if (w<1) w=1;
-			if (w>1 && !(fl&LEFT_ADJ)) fprintf(f, "%*s", w-1, "");
-			fputwc(btowc(arg.i), f);
-			if (w>1 && (fl&LEFT_ADJ)) fprintf(f, "%*s", w-1, "");
+			pad(f, w-1, fl);
+			out(f, &(wchar_t){t=='C' ? arg.i : btowc(arg.i)}, 1);
+			pad(f, w-1, fl^LEFT_ADJ);
 			l = w;
 			continue;
-		case 'C':
-			fputwc(arg.i, f);
-			l = 1;
-			continue;
 		case 'S':
 			a = arg.p;
 			z = a + wcsnlen(a, p<0 ? INT_MAX : p);
 			if (p<0 && *z) goto overflow;
 			p = z-a;
 			if (w<p) w=p;
-			if (!(fl&LEFT_ADJ)) fprintf(f, "%*s", w-p, "");
+			pad(f, w-p, fl);
 			out(f, a, p);
-			if ((fl&LEFT_ADJ)) fprintf(f, "%*s", w-p, "");
+			pad(f, w-p, fl^LEFT_ADJ);
 			l=w;
 			continue;
 		case 'm':
@@ -289,14 +296,14 @@ static int wprintf_core(FILE *f, const wchar_t *fmt, va_list *ap, union arg *nl_
 			if (p<0 && *bs) goto overflow;
 			p=l;
 			if (w<p) w=p;
-			if (!(fl&LEFT_ADJ)) fprintf(f, "%*s", w-p, "");
+			pad(f, w-p, fl);
 			bs = arg.p;
 			while (l--) {
 				i=mbtowc(&wc, bs, MB_LEN_MAX);
 				bs+=i;
-				fputwc(wc, f);
+				out(f, &wc, 1);
 			}
-			if ((fl&LEFT_ADJ)) fprintf(f, "%*s", w-p, "");
+			pad(f, w-p, fl^LEFT_ADJ);
 			l=w;
 			continue;
 		}
@@ -340,8 +347,8 @@ overflow:
 int vfwprintf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap)
 {
 	va_list ap2;
-	int nl_type[NL_ARGMAX] = {0};
-	union arg nl_arg[NL_ARGMAX];
+	int nl_type[NL_ARGMAX+1] = {0};
+	union arg nl_arg[NL_ARGMAX+1];
 	int olderr;
 	int ret;
 
@@ -357,7 +364,7 @@ int vfwprintf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap)
 	olderr = f->flags & F_ERR;
 	f->flags &= ~F_ERR;
 	ret = wprintf_core(f, fmt, &ap2, nl_arg, nl_type);
-	if (f->flags & F_ERR) ret = -1;
+	if (ferror(f)) ret = -1;
 	f->flags |= olderr;
 	FUNLOCK(f);
 	va_end(ap2);
diff --git a/src/stdio/vsnprintf.c b/src/stdio/vsnprintf.c
index b3510a63..409b9c85 100644
--- a/src/stdio/vsnprintf.c
+++ b/src/stdio/vsnprintf.c
@@ -45,11 +45,6 @@ int vsnprintf(char *restrict s, size_t n, const char *restrict fmt, va_list ap)
 		.cookie = &c,
 	};
 
-	if (n > INT_MAX) {
-		errno = EOVERFLOW;
-		return -1;
-	}
-
 	*c.s = 0;
 	return vfprintf(&f, fmt, ap);
 }
diff --git a/src/stdio/vsscanf.c b/src/stdio/vsscanf.c
index 98500225..4d6d259b 100644
--- a/src/stdio/vsscanf.c
+++ b/src/stdio/vsscanf.c
@@ -1,15 +1,25 @@
 #include "stdio_impl.h"
+#include <string.h>
 
-static size_t do_read(FILE *f, unsigned char *buf, size_t len)
+static size_t string_read(FILE *f, unsigned char *buf, size_t len)
 {
-	return __string_read(f, buf, len);
+	char *src = f->cookie;
+	size_t k = len+256;
+	char *end = memchr(src, 0, k);
+	if (end) k = end-src;
+	if (k < len) len = k;
+	memcpy(buf, src, len);
+	f->rpos = (void *)(src+len);
+	f->rend = (void *)(src+k);
+	f->cookie = src+k;
+	return len;
 }
 
 int vsscanf(const char *restrict s, const char *restrict fmt, va_list ap)
 {
 	FILE f = {
 		.buf = (void *)s, .cookie = (void *)s,
-		.read = do_read, .lock = -1
+		.read = string_read, .lock = -1
 	};
 	return vfscanf(&f, fmt, ap);
 }
diff --git a/src/stdio/vswprintf.c b/src/stdio/vswprintf.c
index 7f98c5c9..5e9a4dad 100644
--- a/src/stdio/vswprintf.c
+++ b/src/stdio/vswprintf.c
@@ -18,6 +18,7 @@ static size_t sw_write(FILE *f, const unsigned char *s, size_t l)
 	if (s!=f->wbase && sw_write(f, f->wbase, f->wpos-f->wbase)==-1)
 		return -1;
 	while (c->l && l && (i=mbtowc(c->ws, (void *)s, l))>=0) {
+		if (!i) i=1;
 		s+=i;
 		l-=i;
 		c->l--;
@@ -50,9 +51,6 @@ int vswprintf(wchar_t *restrict s, size_t n, const wchar_t *restrict fmt, va_lis
 
 	if (!n) {
 		return -1;
-	} else if (n > INT_MAX) {
-		errno = EOVERFLOW;
-		return -1;
 	}
 	r = vfwprintf(&f, fmt, ap);
 	sw_write(&f, 0, 0);
diff --git a/src/stdlib/qsort.c b/src/stdlib/qsort.c
index da58fd31..ab79dc6f 100644
--- a/src/stdlib/qsort.c
+++ b/src/stdlib/qsort.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 by Valentin Ochs
+/* Copyright (C) 2011 by Lynn Ochs
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
@@ -24,6 +24,7 @@
 /* Smoothsort, an adaptive variant of Heapsort.  Memory usage: O(1).
    Run time: Worst case O(n log n), close to O(n) in the mostly-sorted case. */
 
+#define _BSD_SOURCE
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -31,7 +32,7 @@
 #include "atomic.h"
 #define ntz(x) a_ctz_l((x))
 
-typedef int (*cmpfun)(const void *, const void *);
+typedef int (*cmpfun)(const void *, const void *, void *);
 
 static inline int pntz(size_t p[2]) {
 	int r = ntz(p[0] - 1);
@@ -88,7 +89,7 @@ static inline void shr(size_t p[2], int n)
 	p[1] >>= n;
 }
 
-static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size_t lp[])
+static void sift(unsigned char *head, size_t width, cmpfun cmp, void *arg, int pshift, size_t lp[])
 {
 	unsigned char *rt, *lf;
 	unsigned char *ar[14 * sizeof(size_t) + 1];
@@ -99,10 +100,10 @@ static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size
 		rt = head - width;
 		lf = head - width - lp[pshift - 2];
 
-		if((*cmp)(ar[0], lf) >= 0 && (*cmp)(ar[0], rt) >= 0) {
+		if(cmp(ar[0], lf, arg) >= 0 && cmp(ar[0], rt, arg) >= 0) {
 			break;
 		}
-		if((*cmp)(lf, rt) >= 0) {
+		if(cmp(lf, rt, arg) >= 0) {
 			ar[i++] = lf;
 			head = lf;
 			pshift -= 1;
@@ -115,7 +116,7 @@ static void sift(unsigned char *head, size_t width, cmpfun cmp, int pshift, size
 	cycle(width, ar, i);
 }
 
-static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2], int pshift, int trusty, size_t lp[])
+static void trinkle(unsigned char *head, size_t width, cmpfun cmp, void *arg, size_t pp[2], int pshift, int trusty, size_t lp[])
 {
 	unsigned char *stepson,
 	              *rt, *lf;
@@ -130,13 +131,13 @@ static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2],
 	ar[0] = head;
 	while(p[0] != 1 || p[1] != 0) {
 		stepson = head - lp[pshift];
-		if((*cmp)(stepson, ar[0]) <= 0) {
+		if(cmp(stepson, ar[0], arg) <= 0) {
 			break;
 		}
 		if(!trusty && pshift > 1) {
 			rt = head - width;
 			lf = head - width - lp[pshift - 2];
-			if((*cmp)(rt, stepson) >= 0 || (*cmp)(lf, stepson) >= 0) {
+			if(cmp(rt, stepson, arg) >= 0 || cmp(lf, stepson, arg) >= 0) {
 				break;
 			}
 		}
@@ -150,11 +151,11 @@ static void trinkle(unsigned char *head, size_t width, cmpfun cmp, size_t pp[2],
 	}
 	if(!trusty) {
 		cycle(width, ar, i);
-		sift(head, width, cmp, pshift, lp);
+		sift(head, width, cmp, arg, pshift, lp);
 	}
 }
 
-void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
+void __qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
 {
 	size_t lp[12*sizeof(size_t)];
 	size_t i, size = width * nel;
@@ -173,16 +174,16 @@ void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
 
 	while(head < high) {
 		if((p[0] & 3) == 3) {
-			sift(head, width, cmp, pshift, lp);
+			sift(head, width, cmp, arg, pshift, lp);
 			shr(p, 2);
 			pshift += 2;
 		} else {
 			if(lp[pshift - 1] >= high - head) {
-				trinkle(head, width, cmp, p, pshift, 0, lp);
+				trinkle(head, width, cmp, arg, p, pshift, 0, lp);
 			} else {
-				sift(head, width, cmp, pshift, lp);
+				sift(head, width, cmp, arg, pshift, lp);
 			}
-			
+
 			if(pshift == 1) {
 				shl(p, 1);
 				pshift = 0;
@@ -191,12 +192,12 @@ void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
 				pshift = 1;
 			}
 		}
-		
+
 		p[0] |= 1;
 		head += width;
 	}
 
-	trinkle(head, width, cmp, p, pshift, 0, lp);
+	trinkle(head, width, cmp, arg, p, pshift, 0, lp);
 
 	while(pshift != 1 || p[0] != 1 || p[1] != 0) {
 		if(pshift <= 1) {
@@ -208,11 +209,13 @@ void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
 			pshift -= 2;
 			p[0] ^= 7;
 			shr(p, 1);
-			trinkle(head - lp[pshift] - width, width, cmp, p, pshift + 1, 1, lp);
+			trinkle(head - lp[pshift] - width, width, cmp, arg, p, pshift + 1, 1, lp);
 			shl(p, 1);
 			p[0] |= 1;
-			trinkle(head - width, width, cmp, p, pshift, 1, lp);
+			trinkle(head - width, width, cmp, arg, p, pshift, 1, lp);
 		}
 		head -= width;
 	}
 }
+
+weak_alias(__qsort_r, qsort_r);
diff --git a/src/stdlib/qsort_nr.c b/src/stdlib/qsort_nr.c
new file mode 100644
index 00000000..8ffe71d0
--- /dev/null
+++ b/src/stdlib/qsort_nr.c
@@ -0,0 +1,14 @@
+#define _BSD_SOURCE
+#include <stdlib.h>
+
+typedef int (*cmpfun)(const void *, const void *);
+
+static int wrapper_cmp(const void *v1, const void *v2, void *cmp)
+{
+	return ((cmpfun)cmp)(v1, v2);
+}
+
+void qsort(void *base, size_t nel, size_t width, cmpfun cmp)
+{
+	__qsort_r(base, nel, width, wrapper_cmp, (void *)cmp);
+}
diff --git a/src/stdlib/strtod.c b/src/stdlib/strtod.c
index a5d0118a..39b9daad 100644
--- a/src/stdlib/strtod.c
+++ b/src/stdlib/strtod.c
@@ -28,10 +28,3 @@ long double strtold(const char *restrict s, char **restrict p)
 {
 	return strtox(s, p, 2);
 }
-
-weak_alias(strtof, strtof_l);
-weak_alias(strtod, strtod_l);
-weak_alias(strtold, strtold_l);
-weak_alias(strtof, __strtof_l);
-weak_alias(strtod, __strtod_l);
-weak_alias(strtold, __strtold_l);
diff --git a/src/stdlib/wcstod.c b/src/stdlib/wcstod.c
index 26fe9af8..0deb7010 100644
--- a/src/stdlib/wcstod.c
+++ b/src/stdlib/wcstod.c
@@ -33,8 +33,7 @@ static long double wcstox(const wchar_t *s, wchar_t **p, int prec)
 	unsigned char buf[64];
 	FILE f = {0};
 	f.flags = 0;
-	f.rpos = f.rend = 0;
-	f.buf = buf + 4;
+	f.rpos = f.rend = f.buf = buf + 4;
 	f.buf_size = sizeof buf - 4;
 	f.lock = -1;
 	f.read = do_read;
diff --git a/src/stdlib/wcstol.c b/src/stdlib/wcstol.c
index 4443f577..1eeb495f 100644
--- a/src/stdlib/wcstol.c
+++ b/src/stdlib/wcstol.c
@@ -35,8 +35,7 @@ static unsigned long long wcstox(const wchar_t *s, wchar_t **p, int base, unsign
 	unsigned char buf[64];
 	FILE f = {0};
 	f.flags = 0;
-	f.rpos = f.rend = 0;
-	f.buf = buf + 4;
+	f.rpos = f.rend = f.buf = buf + 4;
 	f.buf_size = sizeof buf - 4;
 	f.lock = -1;
 	f.read = do_read;
diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
new file mode 100644
index 00000000..48bb8a8d
--- /dev/null
+++ b/src/string/aarch64/memcpy.S
@@ -0,0 +1,186 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define src     x1
+#define count   x2
+#define dst     x3
+#define srcend  x4
+#define dstend  x5
+#define A_l     x6
+#define A_lw    w6
+#define A_h     x7
+#define B_l     x8
+#define B_lw    w8
+#define B_h     x9
+#define C_l     x10
+#define C_lw    w10
+#define C_h     x11
+#define D_l     x12
+#define D_h     x13
+#define E_l     x14
+#define E_h     x15
+#define F_l     x16
+#define F_h     x17
+#define G_l     count
+#define G_h     dst
+#define H_l     src
+#define H_h     srcend
+#define tmp1    x14
+
+/* This implementation of memcpy uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+.global memcpy
+.type memcpy,%function
+memcpy:
+	add     srcend, src, count
+	add     dstend, dstin, count
+	cmp     count, 128
+	b.hi    .Lcopy_long
+	cmp     count, 32
+	b.hi    .Lcopy32_128
+
+	/* Small copies: 0..32 bytes.  */
+	cmp     count, 16
+	b.lo    .Lcopy16
+	ldp     A_l, A_h, [src]
+	ldp     D_l, D_h, [srcend, -16]
+	stp     A_l, A_h, [dstin]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+.Lcopy16:
+	tbz     count, 3, .Lcopy8
+	ldr     A_l, [src]
+	ldr     A_h, [srcend, -8]
+	str     A_l, [dstin]
+	str     A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+.Lcopy8:
+	tbz     count, 2, .Lcopy4
+	ldr     A_lw, [src]
+	ldr     B_lw, [srcend, -4]
+	str     A_lw, [dstin]
+	str     B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+.Lcopy4:
+	cbz     count, .Lcopy0
+	lsr     tmp1, count, 1
+	ldrb    A_lw, [src]
+	ldrb    C_lw, [srcend, -1]
+	ldrb    B_lw, [src, tmp1]
+	strb    A_lw, [dstin]
+	strb    B_lw, [dstin, tmp1]
+	strb    C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+.Lcopy32_128:
+	ldp     A_l, A_h, [src]
+	ldp     B_l, B_h, [src, 16]
+	ldp     C_l, C_h, [srcend, -32]
+	ldp     D_l, D_h, [srcend, -16]
+	cmp     count, 64
+	b.hi    .Lcopy128
+	stp     A_l, A_h, [dstin]
+	stp     B_l, B_h, [dstin, 16]
+	stp     C_l, C_h, [dstend, -32]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+.Lcopy128:
+	ldp     E_l, E_h, [src, 32]
+	ldp     F_l, F_h, [src, 48]
+	cmp     count, 96
+	b.ls    .Lcopy96
+	ldp     G_l, G_h, [srcend, -64]
+	ldp     H_l, H_h, [srcend, -48]
+	stp     G_l, G_h, [dstend, -64]
+	stp     H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp     A_l, A_h, [dstin]
+	stp     B_l, B_h, [dstin, 16]
+	stp     E_l, E_h, [dstin, 32]
+	stp     F_l, F_h, [dstin, 48]
+	stp     C_l, C_h, [dstend, -32]
+	stp     D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+.Lcopy_long:
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp     D_l, D_h, [src]
+	and     tmp1, dstin, 15
+	bic     dst, dstin, 15
+	sub     src, src, tmp1
+	add     count, count, tmp1      /* Count is now 16 too large.  */
+	ldp     A_l, A_h, [src, 16]
+	stp     D_l, D_h, [dstin]
+	ldp     B_l, B_h, [src, 32]
+	ldp     C_l, C_h, [src, 48]
+	ldp     D_l, D_h, [src, 64]!
+	subs    count, count, 128 + 16  /* Test and readjust count.  */
+	b.ls    .Lcopy64_from_end
+
+.Lloop64:
+	stp     A_l, A_h, [dst, 16]
+	ldp     A_l, A_h, [src, 16]
+	stp     B_l, B_h, [dst, 32]
+	ldp     B_l, B_h, [src, 32]
+	stp     C_l, C_h, [dst, 48]
+	ldp     C_l, C_h, [src, 48]
+	stp     D_l, D_h, [dst, 64]!
+	ldp     D_l, D_h, [src, 64]!
+	subs    count, count, 64
+	b.hi    .Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+.Lcopy64_from_end:
+	ldp     E_l, E_h, [srcend, -64]
+	stp     A_l, A_h, [dst, 16]
+	ldp     A_l, A_h, [srcend, -48]
+	stp     B_l, B_h, [dst, 32]
+	ldp     B_l, B_h, [srcend, -32]
+	stp     C_l, C_h, [dst, 48]
+	ldp     C_l, C_h, [srcend, -16]
+	stp     D_l, D_h, [dst, 64]
+	stp     E_l, E_h, [dstend, -64]
+	stp     A_l, A_h, [dstend, -48]
+	stp     B_l, B_h, [dstend, -32]
+	stp     C_l, C_h, [dstend, -16]
+	ret
+
+.size memcpy,.-memcpy
diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
new file mode 100644
index 00000000..f0d29b7f
--- /dev/null
+++ b/src/string/aarch64/memset.S
@@ -0,0 +1,115 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define val     x1
+#define valw    w1
+#define count   x2
+#define dst     x3
+#define dstend  x4
+#define zva_val x5
+
+.global memset
+.type memset,%function
+memset:
+
+	dup     v0.16B, valw
+	add     dstend, dstin, count
+
+	cmp     count, 96
+	b.hi    .Lset_long
+	cmp     count, 16
+	b.hs    .Lset_medium
+	mov     val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz     count, 3, 1f
+	str     val, [dstin]
+	str     val, [dstend, -8]
+	ret
+	nop
+1:      tbz     count, 2, 2f
+	str     valw, [dstin]
+	str     valw, [dstend, -4]
+	ret
+2:      cbz     count, 3f
+	strb    valw, [dstin]
+	tbz     count, 1, 3f
+	strh    valw, [dstend, -2]
+3:      ret
+
+	/* Set 17..96 bytes.  */
+.Lset_medium:
+	str     q0, [dstin]
+	tbnz    count, 6, .Lset96
+	str     q0, [dstend, -16]
+	tbz     count, 5, 1f
+	str     q0, [dstin, 16]
+	str     q0, [dstend, -32]
+1:      ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+.Lset96:
+	str     q0, [dstin, 16]
+	stp     q0, q0, [dstin, 32]
+	stp     q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+.Lset_long:
+	and     valw, valw, 255
+	bic     dst, dstin, 15
+	str     q0, [dstin]
+	cmp     count, 160
+	ccmp    valw, 0, 0, hs
+	b.ne    .Lno_zva
+
+#ifndef SKIP_ZVA_CHECK
+	mrs     zva_val, dczid_el0
+	and     zva_val, zva_val, 31
+	cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+	b.ne    .Lno_zva
+#endif
+	str     q0, [dst, 16]
+	stp     q0, q0, [dst, 32]
+	bic     dst, dst, 63
+	sub     count, dstend, dst      /* Count is now 64 too large.  */
+	sub     count, count, 128       /* Adjust count and bias for loop.  */
+
+	.p2align 4
+.Lzva_loop:
+	add     dst, dst, 64
+	dc      zva, dst
+	subs    count, count, 64
+	b.hi    .Lzva_loop
+	stp     q0, q0, [dstend, -64]
+	stp     q0, q0, [dstend, -32]
+	ret
+
+.Lno_zva:
+	sub     count, dstend, dst      /* Count is 16 too large.  */
+	sub     dst, dst, 16            /* Dst is biased by -32.  */
+	sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+.Lno_zva_loop:
+	stp     q0, q0, [dst, 32]
+	stp     q0, q0, [dst, 64]!
+	subs    count, count, 64
+	b.hi    .Lno_zva_loop
+	stp     q0, q0, [dstend, -64]
+	stp     q0, q0, [dstend, -32]
+	ret
+
+.size memset,.-memset
+
diff --git a/src/string/arm/memcpy_le.S b/src/string/arm/memcpy.S
index 7b35d305..869e3448 100644
--- a/src/string/arm/memcpy_le.S
+++ b/src/string/arm/memcpy.S
@@ -1,5 +1,3 @@
-#if !__ARMEB__
-
 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
@@ -42,7 +40,7 @@
  * code safely callable from thumb mode, adjusting the return
  * instructions to be compatible with pre-thumb ARM cpus, removal of
  * prefetch code that is not compatible with older cpus and support for
- * building as thumb 2.
+ * building as thumb 2 and big-endian.
  */
 
 .syntax unified
@@ -227,24 +225,45 @@ non_congruent:
 	 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
 	 */
 	movs    r5, r5, lsl #31
+
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs	r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
+#endif
 
 	cmp     r2, #4
 	blo     partial_word_tail
 
+#if __ARMEB__
+	mov	r3, r3, lsr r12
+	mov	r3, r3, lsl r12
+#endif
+
 	/* Align destination to 32 bytes (cache line boundary) */
 1:      tst     r0, #0x1c
 	beq     2f
 	ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3, r5,                 lsl r12
+#else
 	mov     r4, r5,                 lsl lr
 	orr     r4, r4, r3
 	mov     r3, r5,                 lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -270,6 +289,25 @@ loop16:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #16
+	mov     r4, r4, lsl #16
+	orr     r4, r4, r5, lsr #16
+	mov     r5, r5, lsl #16
+	orr     r5, r5, r6, lsr #16
+	mov     r6, r6, lsl #16
+	orr     r6, r6, r7, lsr #16
+	mov     r7, r7, lsl #16
+	orr     r7, r7, r8, lsr #16
+	mov     r8, r8, lsl #16
+	orr     r8, r8, r9, lsr #16
+	mov     r9, r9, lsl #16
+	orr     r9, r9, r10, lsr #16
+	mov     r10, r10,               lsl #16
+	orr     r10, r10, r11, lsr #16
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #16
+#else
 	orr     r3, r3, r4, lsl #16
 	mov     r4, r4, lsr #16
 	orr     r4, r4, r5, lsl #16
@@ -287,6 +325,7 @@ loop16:
 	orr     r10, r10, r11, lsl #16
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #16
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -296,6 +335,25 @@ loop8:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #24
+	mov     r4, r4, lsl #8
+	orr     r4, r4, r5, lsr #24
+	mov     r5, r5, lsl #8
+	orr     r5, r5, r6, lsr #24
+	mov     r6, r6,  lsl #8
+	orr     r6, r6, r7, lsr #24
+	mov     r7, r7,  lsl #8
+	orr     r7, r7, r8,             lsr #24
+	mov     r8, r8,  lsl #8
+	orr     r8, r8, r9,             lsr #24
+	mov     r9, r9,  lsl #8
+	orr     r9, r9, r10,    lsr #24
+	mov     r10, r10, lsl #8
+	orr     r10, r10, r11,  lsr #24
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #8
+#else
 	orr     r3, r3, r4, lsl #24
 	mov     r4, r4, lsr #8
 	orr     r4, r4, r5, lsl #24
@@ -313,6 +371,7 @@ loop8:
 	orr     r10, r10, r11,  lsl #24
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #8
+#endif
 	bhs     1b
 	b       less_than_thirtytwo
 
@@ -322,6 +381,25 @@ loop24:
 	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
 	subs    r2, r2, #32
 	ldrhs   r12, [r1], #4
+#if __ARMEB__
+	orr     r3, r3, r4, lsr #8
+	mov     r4, r4, lsl #24
+	orr     r4, r4, r5, lsr #8
+	mov     r5, r5, lsl #24
+	orr     r5, r5, r6, lsr #8
+	mov     r6, r6, lsl #24
+	orr     r6, r6, r7, lsr #8
+	mov     r7, r7, lsl #24
+	orr     r7, r7, r8, lsr #8
+	mov     r8, r8, lsl #24
+	orr     r8, r8, r9, lsr #8
+	mov     r9, r9, lsl #24
+	orr     r9, r9, r10, lsr #8
+	mov     r10, r10, lsl #24
+	orr     r10, r10, r11, lsr #8
+	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+	mov     r3, r11, lsl #24
+#else
 	orr     r3, r3, r4, lsl #8
 	mov     r4, r4, lsr #24
 	orr     r4, r4, r5, lsl #8
@@ -339,6 +417,7 @@ loop24:
 	orr     r10, r10, r11, lsl #8
 	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
 	mov     r3, r11, lsr #24
+#endif
 	bhs     1b
 
 less_than_thirtytwo:
@@ -350,9 +429,15 @@ less_than_thirtytwo:
 
 1:      ldr     r5, [r1], #4
 	sub     r2, r2, #4
+#if __ARMEB__
+	mov     r4, r5,                 lsr lr
+	orr     r4, r4, r3
+	mov     r3,     r5,                     lsl r12
+#else
 	mov     r4, r5,                 lsl lr
 	orr     r4, r4, r3
 	mov     r3,     r5,                     lsr r12
+#endif
 	str     r4, [r0], #4
 	cmp     r2, #4
 	bhs     1b
@@ -360,11 +445,20 @@ less_than_thirtytwo:
 partial_word_tail:
 	/* we have a partial word in the input buffer */
 	movs    r5, lr, lsl #(31-3)
+#if __ARMEB__
+	movmi   r3, r3, ror #24
+	strbmi r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+	movcs   r3, r3, ror #24
+	strbcs r3, [r0], #1
+#else
 	strbmi r3, [r0], #1
 	movmi   r3, r3, lsr #8
 	strbcs r3, [r0], #1
 	movcs   r3, r3, lsr #8
 	strbcs r3, [r0], #1
+#endif
 
 	/* Refill spilled registers from the stack. Don't update sp. */
 	ldmfd   sp, {r5-r11}
@@ -383,4 +477,3 @@ copy_last_3_and_return:
 	ldmfd   sp!, {r0, r4, lr}
 	bx      lr
 
-#endif
diff --git a/src/string/arm/memcpy.c b/src/string/arm/memcpy.c
deleted file mode 100644
index 041614f4..00000000
--- a/src/string/arm/memcpy.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#if __ARMEB__
-#include "../memcpy.c"
-#endif
diff --git a/src/string/memccpy.c b/src/string/memccpy.c
index 00c18e2b..3b0a3700 100644
--- a/src/string/memccpy.c
+++ b/src/string/memccpy.c
@@ -29,6 +29,6 @@ void *memccpy(void *restrict dest, const void *restrict src, int c, size_t n)
 #endif
 	for (; n && (*d=*s)!=c; n--, s++, d++);
 tail:
-	if (n && *s==c) return d+1;
+	if (n) return d+1;
 	return 0;
 }
diff --git a/src/string/memmem.c b/src/string/memmem.c
index 58a21fcd..11eff86e 100644
--- a/src/string/memmem.c
+++ b/src/string/memmem.c
@@ -12,8 +12,8 @@ static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned cha
 
 static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8;
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8;
 	for (h+=3, k-=3; k; k--, hw = (hw|*h++)<<8)
 		if (hw == nw) return (char *)h-3;
 	return hw == nw ? (char *)h-3 : 0;
@@ -21,8 +21,8 @@ static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned c
 
 static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
 	for (h+=4, k-=4; k; k--, hw = hw<<8 | *h++)
 		if (hw == nw) return (char *)h-4;
 	return hw == nw ? (char *)h-4 : 0;
diff --git a/src/string/strcasestr.c b/src/string/strcasestr.c
index af109f36..dc598bc3 100644
--- a/src/string/strcasestr.c
+++ b/src/string/strcasestr.c
@@ -4,6 +4,7 @@
 char *strcasestr(const char *h, const char *n)
 {
 	size_t l = strlen(n);
+	if (!l) return (char *)h;
 	for (; *h; h++) if (!strncasecmp(h, n, l)) return (char *)h;
 	return 0;
 }
diff --git a/src/string/strsignal.c b/src/string/strsignal.c
index 96bfe841..5156366e 100644
--- a/src/string/strsignal.c
+++ b/src/string/strsignal.c
@@ -31,7 +31,11 @@ static const char map[] = {
 	[SIGPIPE]   = 13,
 	[SIGALRM]   = 14,
 	[SIGTERM]   = 15,
+#if defined(SIGSTKFLT)
 	[SIGSTKFLT] = 16,
+#elif defined(SIGEMT)
+	[SIGEMT]    = 16,
+#endif
 	[SIGCHLD]   = 17,
 	[SIGCONT]   = 18,
 	[SIGSTOP]   = 19,
@@ -70,7 +74,13 @@ static const char strings[] =
 	"Broken pipe\0"
 	"Alarm clock\0"
 	"Terminated\0"
+#if defined(SIGSTKFLT)
 	"Stack fault\0"
+#elif defined(SIGEMT)
+	"Emulator trap\0"
+#else
+	"Unknown signal\0"
+#endif
 	"Child process status\0"
 	"Continued\0"
 	"Stopped (signal)\0"
diff --git a/src/string/strstr.c b/src/string/strstr.c
index 55ba1c7b..96657bc2 100644
--- a/src/string/strstr.c
+++ b/src/string/strstr.c
@@ -10,16 +10,16 @@ static char *twobyte_strstr(const unsigned char *h, const unsigned char *n)
 
 static char *threebyte_strstr(const unsigned char *h, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8;
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8;
 	for (h+=2; *h && hw != nw; hw = (hw|*++h)<<8);
 	return *h ? (char *)h-2 : 0;
 }
 
 static char *fourbyte_strstr(const unsigned char *h, const unsigned char *n)
 {
-	uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
-	uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
+	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
+	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
 	for (h+=3; *h && hw != nw; hw = hw<<8 | *++h);
 	return *h ? (char *)h-3 : 0;
 }
@@ -96,7 +96,7 @@ static char *twoway_strstr(const unsigned char *h, const unsigned char *n)
 	for (;;) {
 		/* Update incremental end-of-haystack pointer */
 		if (z-h < l) {
-			/* Fast estimate for MIN(l,63) */
+			/* Fast estimate for MAX(l,63) */
 			size_t grow = l | 63;
 			const unsigned char *z2 = memchr(z, 0, grow);
 			if (z2) {
diff --git a/src/string/strverscmp.c b/src/string/strverscmp.c
index 4daf276d..16c1da22 100644
--- a/src/string/strverscmp.c
+++ b/src/string/strverscmp.c
@@ -18,9 +18,9 @@ int strverscmp(const char *l0, const char *r0)
 		else if (c!='0') z=0;
 	}
 
-	if (l[dp]!='0' && r[dp]!='0') {
-		/* If we're not looking at a digit sequence that began
-		 * with a zero, longest digit string is greater. */
+	if (l[dp]-'1'<9U && r[dp]-'1'<9U) {
+		/* If we're looking at non-degenerate digit sequences starting
+		 * with nonzero digits, longest digit string is greater. */
 		for (j=i; isdigit(l[j]); j++)
 			if (!isdigit(r[j])) return 1;
 		if (isdigit(r[j])) return -1;
diff --git a/src/string/wcscmp.c b/src/string/wcscmp.c
index 26eeee70..286ec3ea 100644
--- a/src/string/wcscmp.c
+++ b/src/string/wcscmp.c
@@ -3,5 +3,5 @@
 int wcscmp(const wchar_t *l, const wchar_t *r)
 {
 	for (; *l==*r && *l && *r; l++, r++);
-	return *l - *r;
+	return *l < *r ? -1 : *l > *r;
 }
diff --git a/src/string/wcsncmp.c b/src/string/wcsncmp.c
index 4ab32a92..2b3558bf 100644
--- a/src/string/wcsncmp.c
+++ b/src/string/wcsncmp.c
@@ -3,5 +3,5 @@
 int wcsncmp(const wchar_t *l, const wchar_t *r, size_t n)
 {
 	for (; n && *l==*r && *l && *r; n--, l++, r++);
-	return n ? *l - *r : 0;
+	return n ? (*l < *r ? -1 : *l > *r) : 0;
 }
diff --git a/src/string/wmemcmp.c b/src/string/wmemcmp.c
index 2a193263..717d77b1 100644
--- a/src/string/wmemcmp.c
+++ b/src/string/wmemcmp.c
@@ -3,5 +3,5 @@
 int wmemcmp(const wchar_t *l, const wchar_t *r, size_t n)
 {
 	for (; n && *l==*r; n--, l++, r++);
-	return n ? *l-*r : 0;
+	return n ? (*l < *r ? -1 : *l > *r) : 0;
 }
diff --git a/src/temp/__randname.c b/src/temp/__randname.c
index 2bce37a0..e9b970f1 100644
--- a/src/temp/__randname.c
+++ b/src/temp/__randname.c
@@ -1,5 +1,6 @@
 #include <time.h>
 #include <stdint.h>
+#include "pthread_impl.h"
 
 /* This assumes that a check for the
    template size has already been made */
@@ -10,7 +11,7 @@ char *__randname(char *template)
 	unsigned long r;
 
 	__clock_gettime(CLOCK_REALTIME, &ts);
-	r = ts.tv_nsec*65537 ^ (uintptr_t)&ts / 16 + (uintptr_t)template;
+	r = ts.tv_sec + ts.tv_nsec + __pthread_self()->tid * 65537UL;
 	for (i=0; i<6; i++, r>>=5)
 		template[i] = 'A'+(r&15)+(r&16)*2;
 
diff --git a/src/temp/mkostemp.c b/src/temp/mkostemp.c
index d8dcb805..e3dfdd91 100644
--- a/src/temp/mkostemp.c
+++ b/src/temp/mkostemp.c
@@ -5,5 +5,3 @@ int mkostemp(char *template, int flags)
 {
 	return __mkostemps(template, 0, flags);
 }
-
-weak_alias(mkostemp, mkostemp64);
diff --git a/src/temp/mkostemps.c b/src/temp/mkostemps.c
index ef24eeae..093d2380 100644
--- a/src/temp/mkostemps.c
+++ b/src/temp/mkostemps.c
@@ -26,4 +26,3 @@ int __mkostemps(char *template, int len, int flags)
 }
 
 weak_alias(__mkostemps, mkostemps);
-weak_alias(__mkostemps, mkostemps64);
diff --git a/src/temp/mkstemp.c b/src/temp/mkstemp.c
index 166b8afe..76c835bb 100644
--- a/src/temp/mkstemp.c
+++ b/src/temp/mkstemp.c
@@ -4,5 +4,3 @@ int mkstemp(char *template)
 {
 	return __mkostemps(template, 0, 0);
 }
-
-weak_alias(mkstemp, mkstemp64);
diff --git a/src/temp/mkstemps.c b/src/temp/mkstemps.c
index 6b7531b5..f8eabfec 100644
--- a/src/temp/mkstemps.c
+++ b/src/temp/mkstemps.c
@@ -5,5 +5,3 @@ int mkstemps(char *template, int len)
 {
 	return __mkostemps(template, len, 0);
 }
-
-weak_alias(mkstemps, mkstemps64);
diff --git a/src/termios/cfgetospeed.c b/src/termios/cfgetospeed.c
index 55fa6f55..de46a1d8 100644
--- a/src/termios/cfgetospeed.c
+++ b/src/termios/cfgetospeed.c
@@ -9,5 +9,5 @@ speed_t cfgetospeed(const struct termios *tio)
 
 speed_t cfgetispeed(const struct termios *tio)
 {
-	return cfgetospeed(tio);
+	return (tio->c_cflag & CIBAUD) / (CIBAUD/CBAUD);
 }
diff --git a/src/termios/cfsetospeed.c b/src/termios/cfsetospeed.c
index c9cbdd9d..3eab092a 100644
--- a/src/termios/cfsetospeed.c
+++ b/src/termios/cfsetospeed.c
@@ -16,7 +16,11 @@ int cfsetospeed(struct termios *tio, speed_t speed)
 
 int cfsetispeed(struct termios *tio, speed_t speed)
 {
-	return speed ? cfsetospeed(tio, speed) : 0;
+	if (speed & ~CBAUD) {
+		errno = EINVAL;
+		return -1;
+	}
+	tio->c_cflag &= ~CIBAUD;
+	tio->c_cflag |= speed * (CIBAUD/CBAUD);
+	return 0;
 }
-
-weak_alias(cfsetospeed, cfsetspeed);
diff --git a/src/termios/cfsetspeed.c b/src/termios/cfsetspeed.c
new file mode 100644
index 00000000..2c369db9
--- /dev/null
+++ b/src/termios/cfsetspeed.c
@@ -0,0 +1,11 @@
+#define _BSD_SOURCE
+#include <termios.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+
+int cfsetspeed(struct termios *tio, speed_t speed)
+{
+	int r = cfsetospeed(tio, speed);
+	if (!r) cfsetispeed(tio, 0);
+	return r;
+}
diff --git a/src/termios/tcgetwinsize.c b/src/termios/tcgetwinsize.c
new file mode 100644
index 00000000..9b3a65a4
--- /dev/null
+++ b/src/termios/tcgetwinsize.c
@@ -0,0 +1,8 @@
+#include <termios.h>
+#include <sys/ioctl.h>
+#include "syscall.h"
+
+int tcgetwinsize(int fd, struct winsize *wsz)
+{
+	return syscall(SYS_ioctl, fd, TIOCGWINSZ, wsz);
+}
diff --git a/src/termios/tcsetwinsize.c b/src/termios/tcsetwinsize.c
new file mode 100644
index 00000000..e01d0e25
--- /dev/null
+++ b/src/termios/tcsetwinsize.c
@@ -0,0 +1,8 @@
+#include <termios.h>
+#include <sys/ioctl.h>
+#include "syscall.h"
+
+int tcsetwinsize(int fd, const struct winsize *wsz)
+{
+	return syscall(SYS_ioctl, fd, TIOCSWINSZ, wsz);
+}
diff --git a/src/thread/__lock.c b/src/thread/__lock.c
index 45557c88..60eece49 100644
--- a/src/thread/__lock.c
+++ b/src/thread/__lock.c
@@ -18,9 +18,11 @@
 
 void __lock(volatile int *l)
 {
-	if (!libc.threads_minus_1) return;
+	int need_locks = libc.need_locks;
+	if (!need_locks) return;
 	/* fast path: INT_MIN for the lock, +1 for the congestion */
 	int current = a_cas(l, 0, INT_MIN + 1);
+	if (need_locks < 0) libc.need_locks = 0;
 	if (!current) return;
 	/* A first spin loop, for medium congestion. */
 	for (unsigned i = 0; i < 10; ++i) {
diff --git a/src/thread/aarch64/clone.s b/src/thread/aarch64/clone.s
index e3c83395..9ac272bd 100644
--- a/src/thread/aarch64/clone.s
+++ b/src/thread/aarch64/clone.s
@@ -24,7 +24,8 @@ __clone:
 	// parent
 	ret
 	// child
-1:	ldp x1,x0,[sp],#16
+1:	mov fp, 0
+	ldp x1,x0,[sp],#16
 	blr x1
 	mov x8,#93 // SYS_exit
 	svc #0
diff --git a/src/thread/arm/clone.s b/src/thread/arm/clone.s
index bb0965da..4ff0c0e8 100644
--- a/src/thread/arm/clone.s
+++ b/src/thread/arm/clone.s
@@ -19,7 +19,8 @@ __clone:
 	ldmfd sp!,{r4,r5,r6,r7}
 	bx lr
 
-1:	mov r0,r6
+1:	mov fp,#0
+	mov r0,r6
 	bl 3f
 2:	mov r7,#1
 	svc 0
diff --git a/src/thread/i386/__set_thread_area.s b/src/thread/i386/__set_thread_area.s
index c2c21dd5..aa6852be 100644
--- a/src/thread/i386/__set_thread_area.s
+++ b/src/thread/i386/__set_thread_area.s
@@ -28,6 +28,7 @@ __set_thread_area:
 	ret
 2:
 	mov %ebx,%ecx
+	xor %eax,%eax
 	xor %ebx,%ebx
 	xor %edx,%edx
 	mov %ebx,(%esp)
diff --git a/src/thread/loongarch64/__set_thread_area.s b/src/thread/loongarch64/__set_thread_area.s
new file mode 100644
index 00000000..021307fc
--- /dev/null
+++ b/src/thread/loongarch64/__set_thread_area.s
@@ -0,0 +1,7 @@
+.global __set_thread_area
+.hidden __set_thread_area
+.type   __set_thread_area,@function
+__set_thread_area:
+	move $tp, $a0
+	move $a0, $zero
+	jr   $ra
diff --git a/src/thread/loongarch64/__unmapself.s b/src/thread/loongarch64/__unmapself.s
new file mode 100644
index 00000000..719ad056
--- /dev/null
+++ b/src/thread/loongarch64/__unmapself.s
@@ -0,0 +1,7 @@
+.global __unmapself
+.type   __unmapself, @function
+__unmapself:
+	li.d    $a7, 215   # call munmap
+	syscall 0
+	li.d    $a7, 93    # call exit
+	syscall 0
diff --git a/src/thread/loongarch64/clone.s b/src/thread/loongarch64/clone.s
new file mode 100644
index 00000000..cb4aacfc
--- /dev/null
+++ b/src/thread/loongarch64/clone.s
@@ -0,0 +1,30 @@
+#__clone(func, stack, flags, arg, ptid, tls, ctid)
+#         a0,    a1,   a2,    a3,  a4,  a5,   a6
+# sys_clone(flags, stack, ptid, ctid, tls)
+#            a0,    a1,   a2,    a3,  a4
+
+.global __clone
+.hidden __clone
+.type __clone,@function
+__clone:
+	bstrins.d $a1, $zero, 3, 0   #stack to 16 align
+	# Save function pointer and argument pointer on new thread stack
+	addi.d  $a1, $a1, -16
+	st.d    $a0, $a1, 0     # save function pointer
+	st.d    $a3, $a1, 8     # save argument pointer
+	or      $a0, $a2, $zero
+	or      $a2, $a4, $zero
+	or      $a3, $a6, $zero
+	or      $a4, $a5, $zero
+	ori     $a7, $zero, 220
+	syscall 0               # call clone
+
+	beqz    $a0, 1f         # whether child process
+	jirl    $zero, $ra, 0   # parent process return
+1:
+	move    $fp, $zero
+	ld.d    $t8, $sp, 0     # function pointer
+	ld.d    $a0, $sp, 8     # argument pointer
+	jirl    $ra, $t8, 0     # call the user's function
+	ori     $a7, $zero, 93
+	syscall 0               # child process exit
diff --git a/src/thread/loongarch64/syscall_cp.s b/src/thread/loongarch64/syscall_cp.s
new file mode 100644
index 00000000..c057a97b
--- /dev/null
+++ b/src/thread/loongarch64/syscall_cp.s
@@ -0,0 +1,29 @@
+.global __cp_begin
+.hidden __cp_begin
+.global __cp_end
+.hidden __cp_end
+.global __cp_cancel
+.hidden __cp_cancel
+.hidden __cancel
+.global __syscall_cp_asm
+.hidden __syscall_cp_asm
+.type   __syscall_cp_asm,@function
+
+__syscall_cp_asm:
+__cp_begin:
+	ld.w $a0, $a0, 0
+	bnez $a0, __cp_cancel
+	move $t8, $a1    # reserve system call number
+	move $a0, $a2
+	move $a1, $a3
+	move $a2, $a4
+	move $a3, $a5
+	move $a4, $a6
+	move $a5, $a7
+	move $a7, $t8
+	syscall 0
+__cp_end:
+	jr $ra
+__cp_cancel:
+	la.local $t8, __cancel
+	jr $t8
diff --git a/src/thread/m68k/clone.s b/src/thread/m68k/clone.s
index f6dfa06f..0134cf4e 100644
--- a/src/thread/m68k/clone.s
+++ b/src/thread/m68k/clone.s
@@ -18,7 +18,8 @@ __clone:
 	beq 1f
 	movem.l (%sp)+,%d2-%d5
 	rts
-1:	move.l %a1,-(%sp)
+1:	suba.l %fp,%fp
+	move.l %a1,-(%sp)
 	jsr (%a0)
 	move.l #1,%d0
 	trap #0
diff --git a/src/thread/microblaze/clone.s b/src/thread/microblaze/clone.s
index b68cc5fc..64e3f074 100644
--- a/src/thread/microblaze/clone.s
+++ b/src/thread/microblaze/clone.s
@@ -22,7 +22,8 @@ __clone:
 	rtsd    r15, 8
 	nop
 
-1:	lwi     r3, r1, 0
+1:	add     r19, r0, r0
+	lwi     r3, r1, 0
 	lwi     r5, r1, 4
 	brald   r15, r3
 	nop
diff --git a/src/thread/mips/clone.s b/src/thread/mips/clone.s
index 04463385..229b987e 100644
--- a/src/thread/mips/clone.s
+++ b/src/thread/mips/clone.s
@@ -27,7 +27,8 @@ __clone:
 	addu $sp, $sp, 16
 	jr $ra
 	nop
-1:	lw $25, 0($sp)
+1:	move $fp, $0
+	lw $25, 0($sp)
 	lw $4, 4($sp)
 	jalr $25
 	nop
diff --git a/src/thread/mips64/clone.s b/src/thread/mips64/clone.s
index 2d86899a..8de3db6c 100644
--- a/src/thread/mips64/clone.s
+++ b/src/thread/mips64/clone.s
@@ -25,7 +25,8 @@ __clone:
 	nop
 	jr	$ra
 	nop
-1:	ld	$25, 0($sp)	# function pointer
+1:	move	$fp, $0
+	ld	$25, 0($sp)	# function pointer
 	ld	$4, 8($sp)	# argument pointer
 	jalr	$25		# call the user's function
 	nop
diff --git a/src/thread/mipsn32/clone.s b/src/thread/mipsn32/clone.s
index 4d3c8c7a..9571231a 100644
--- a/src/thread/mipsn32/clone.s
+++ b/src/thread/mipsn32/clone.s
@@ -25,7 +25,8 @@ __clone:
 	nop
 	jr	$ra
 	nop
-1:	lw	$25, 0($sp)	# function pointer
+1:	move	$fp, $0
+	lw	$25, 0($sp)	# function pointer
 	lw	$4, 4($sp)	# argument pointer
 	jalr	$25		# call the user's function
 	nop
diff --git a/src/thread/or1k/clone.s b/src/thread/or1k/clone.s
index 2473ac20..b41488a2 100644
--- a/src/thread/or1k/clone.s
+++ b/src/thread/or1k/clone.s
@@ -6,6 +6,8 @@
 .hidden __clone
 .type   __clone,@function
 __clone:
+	l.xori	r11, r0, -4
+	l.and	r4, r4, r11
 	l.addi	r4, r4, -8
 	l.sw	0(r4), r3
 	l.sw	4(r4), r6
@@ -23,7 +25,8 @@ __clone:
 	l.jr	r9
 	 l.nop
 
-1:	l.lwz	r11, 0(r1)
+1:	l.ori	r2, r0, 0
+	l.lwz	r11, 0(r1)
 	l.jalr	r11
 	 l.lwz	r3, 4(r1)
 
diff --git a/src/thread/pthread_atfork.c b/src/thread/pthread_atfork.c
index 76497401..26d32543 100644
--- a/src/thread/pthread_atfork.c
+++ b/src/thread/pthread_atfork.c
@@ -1,7 +1,13 @@
 #include <pthread.h>
+#include <errno.h>
 #include "libc.h"
 #include "lock.h"
 
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
+
 static struct atfork_funcs {
 	void (*prepare)(void);
 	void (*parent)(void);
@@ -34,7 +40,7 @@ void __fork_handler(int who)
 int pthread_atfork(void (*prepare)(void), void (*parent)(void), void (*child)(void))
 {
 	struct atfork_funcs *new = malloc(sizeof *new);
-	if (!new) return -1;
+	if (!new) return ENOMEM;
 
 	LOCK(lock);
 	new->next = funcs;
diff --git a/src/thread/pthread_attr_get.c b/src/thread/pthread_attr_get.c
index 4aa5afdb..f12ff442 100644
--- a/src/thread/pthread_attr_get.c
+++ b/src/thread/pthread_attr_get.c
@@ -70,7 +70,7 @@ int pthread_condattr_getpshared(const pthread_condattr_t *restrict a, int *restr
 
 int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict a, int *restrict protocol)
 {
-	*protocol = PTHREAD_PRIO_NONE;
+	*protocol = a->__attr / 8U % 2;
 	return 0;
 }
 int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict a, int *restrict pshared)
diff --git a/src/thread/pthread_cancel.c b/src/thread/pthread_cancel.c
index 2f9d5e97..139a6fc8 100644
--- a/src/thread/pthread_cancel.c
+++ b/src/thread/pthread_cancel.c
@@ -56,7 +56,12 @@ static void cancel_handler(int sig, siginfo_t *si, void *ctx)
 
 	_sigaddset(&uc->uc_sigmask, SIGCANCEL);
 
-	if (self->cancelasync || pc >= (uintptr_t)__cp_begin && pc < (uintptr_t)__cp_end) {
+	if (self->cancelasync) {
+		pthread_sigmask(SIG_SETMASK, &uc->uc_sigmask, 0);
+		__cancel();
+	}
+
+	if (pc >= (uintptr_t)__cp_begin && pc < (uintptr_t)__cp_end) {
 		uc->uc_mcontext.MC_PC = (uintptr_t)__cp_cancel;
 #ifdef CANCEL_GOT
 		uc->uc_mcontext.MC_GOT = CANCEL_GOT;
@@ -77,7 +82,7 @@ void __testcancel()
 static void init_cancellation()
 {
 	struct sigaction sa = {
-		.sa_flags = SA_SIGINFO | SA_RESTART,
+		.sa_flags = SA_SIGINFO | SA_RESTART | SA_ONSTACK,
 		.sa_sigaction = cancel_handler
 	};
 	memset(&sa.sa_mask, -1, _NSIG/8);
diff --git a/src/thread/pthread_cond_timedwait.c b/src/thread/pthread_cond_timedwait.c
index d1501240..6b761455 100644
--- a/src/thread/pthread_cond_timedwait.c
+++ b/src/thread/pthread_cond_timedwait.c
@@ -146,14 +146,18 @@ relock:
 
 	if (oldstate == WAITING) goto done;
 
-	if (!node.next) a_inc(&m->_m_waiters);
+	if (!node.next && !(m->_m_type & 8))
+		a_inc(&m->_m_waiters);
 
 	/* Unlock the barrier that's holding back the next waiter, and
 	 * either wake it or requeue it to the mutex. */
-	if (node.prev)
-		unlock_requeue(&node.prev->barrier, &m->_m_lock, m->_m_type & 128);
-	else
-		a_dec(&m->_m_waiters);
+	if (node.prev) {
+		int val = m->_m_lock;
+		if (val>0) a_cas(&m->_m_lock, val, val|0x80000000);
+		unlock_requeue(&node.prev->barrier, &m->_m_lock, m->_m_type & (8|128));
+	} else if (!(m->_m_type & 8)) {
+		a_dec(&m->_m_waiters);		
+	}
 
 	/* Since a signal was consumed, cancellation is not permitted. */
 	if (e == ECANCELED) e = 0;
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 5f491092..087f6206 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -69,15 +69,29 @@ _Noreturn void __pthread_exit(void *result)
 
 	__pthread_tsd_run_dtors();
 
+	__block_app_sigs(&set);
+
+	/* This atomic potentially competes with a concurrent pthread_detach
+	 * call; the loser is responsible for freeing thread resources. */
+	int state = a_cas(&self->detach_state, DT_JOINABLE, DT_EXITING);
+
+	if (state==DT_DETACHED && self->map_base) {
+		/* Since __unmapself bypasses the normal munmap code path,
+		 * explicitly wait for vmlock holders first. This must be
+		 * done before any locks are taken, to avoid lock ordering
+		 * issues that could lead to deadlock. */
+		__vm_wait();
+	}
+
 	/* Access to target the exiting thread with syscalls that use
 	 * its kernel tid is controlled by killlock. For detached threads,
 	 * any use past this point would have undefined behavior, but for
-	 * joinable threads it's a valid usage that must be handled. */
+	 * joinable threads it's a valid usage that must be handled.
+	 * Signals must be blocked since pthread_kill must be AS-safe. */
 	LOCK(self->killlock);
 
-	/* The thread list lock must be AS-safe, and thus requires
-	 * application signals to be blocked before it can be taken. */
-	__block_app_sigs(&set);
+	/* The thread list lock must be AS-safe, and thus depends on
+	 * application signals being blocked above. */
 	__tl_lock();
 
 	/* If this is the only thread in the list, don't proceed with
@@ -85,19 +99,23 @@ _Noreturn void __pthread_exit(void *result)
 	 * signal state to prepare for exit to call atexit handlers. */
 	if (self->next == self) {
 		__tl_unlock();
-		__restore_sigs(&set);
 		UNLOCK(self->killlock);
+		self->detach_state = state;
+		__restore_sigs(&set);
 		exit(0);
 	}
 
-	/* At this point we are committed to thread termination. Unlink
-	 * the thread from the list. This change will not be visible
-	 * until the lock is released, which only happens after SYS_exit
-	 * has been called, via the exit futex address pointing at the lock. */
-	libc.threads_minus_1--;
-	self->next->prev = self->prev;
-	self->prev->next = self->next;
-	self->prev = self->next = self;
+	/* At this point we are committed to thread termination. */
+
+	/* After the kernel thread exits, its tid may be reused. Clear it
+	 * to prevent inadvertent use and inform functions that would use
+	 * it that it's no longer available. At this point the killlock
+	 * may be released, since functions that use it will consistently
+	 * see the thread as having exited. Release it now so that no
+	 * remaining locks (except thread list) are held if we end up
+	 * resetting need_locks below. */
+	self->tid = 0;
+	UNLOCK(self->killlock);
 
 	/* Process robust list in userspace to handle non-pshared mutexes
 	 * and the detached thread case where the robust list head will
@@ -121,9 +139,15 @@ _Noreturn void __pthread_exit(void *result)
 	__do_orphaned_stdio_locks();
 	__dl_thread_cleanup();
 
-	/* This atomic potentially competes with a concurrent pthread_detach
-	 * call; the loser is responsible for freeing thread resources. */
-	int state = a_cas(&self->detach_state, DT_JOINABLE, DT_EXITING);
+	/* Last, unlink thread from the list. This change will not be visible
+	 * until the lock is released, which only happens after SYS_exit
+	 * has been called, via the exit futex address pointing at the lock.
+	 * This needs to happen after any possible calls to LOCK() that might
+	 * skip locking if process appears single-threaded. */
+	if (!--libc.threads_minus_1) libc.need_locks = -1;
+	self->next->prev = self->prev;
+	self->prev->next = self->next;
+	self->prev = self->next = self;
 
 	if (state==DT_DETACHED && self->map_base) {
 		/* Detached threads must block even implementation-internal
@@ -136,24 +160,15 @@ _Noreturn void __pthread_exit(void *result)
 		if (self->robust_list.off)
 			__syscall(SYS_set_robust_list, 0, 3*sizeof(long));
 
-		/* Since __unmapself bypasses the normal munmap code path,
-		 * explicitly wait for vmlock holders first. */
-		__vm_wait();
-
 		/* The following call unmaps the thread's stack mapping
 		 * and then exits without touching the stack. */
 		__unmapself(self->map_base, self->map_size);
 	}
 
 	/* Wake any joiner. */
+	a_store(&self->detach_state, DT_EXITED);
 	__wake(&self->detach_state, 1, 1);
 
-	/* After the kernel thread exits, its tid may be reused. Clear it
-	 * to prevent inadvertent use and inform functions that would use
-	 * it that it's no longer available. */
-	self->tid = 0;
-	UNLOCK(self->killlock);
-
 	for (;;) __syscall(SYS_exit, 0);
 }
 
@@ -310,7 +325,7 @@ int __pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict att
 		new->detach_state = DT_JOINABLE;
 	}
 	new->robust_list.head = &new->robust_list.head;
-	new->CANARY = self->CANARY;
+	new->canary = self->canary;
 	new->sysinfo = self->sysinfo;
 
 	/* Setup argument structure for the new thread on its stack.
@@ -336,7 +351,7 @@ int __pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict att
 		~(1UL<<((SIGCANCEL-1)%(8*sizeof(long))));
 
 	__tl_lock();
-	libc.threads_minus_1++;
+	if (!libc.threads_minus_1++) libc.need_locks = 1;
 	ret = __clone((c11 ? start_c11 : start), stack, flags, args, &new->tid, TP_ADJ(new), &__thread_list_lock);
 
 	/* All clone failures translate to EAGAIN. If explicit scheduling
@@ -360,7 +375,7 @@ int __pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict att
 		new->next->prev = new;
 		new->prev->next = new;
 	} else {
-		libc.threads_minus_1--;
+		if (!--libc.threads_minus_1) libc.need_locks = 0;
 	}
 	__tl_unlock();
 	__restore_sigs(&set);
diff --git a/src/thread/pthread_detach.c b/src/thread/pthread_detach.c
index 77772af2..d73a500e 100644
--- a/src/thread/pthread_detach.c
+++ b/src/thread/pthread_detach.c
@@ -5,8 +5,12 @@ static int __pthread_detach(pthread_t t)
 {
 	/* If the cas fails, detach state is either already-detached
 	 * or exiting/exited, and pthread_join will trap or cleanup. */
-	if (a_cas(&t->detach_state, DT_JOINABLE, DT_DETACHED) != DT_JOINABLE)
-		return __pthread_join(t, 0);
+	if (a_cas(&t->detach_state, DT_JOINABLE, DT_DETACHED) != DT_JOINABLE) {
+		int cs;
+		__pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+		__pthread_join(t, 0);
+		__pthread_setcancelstate(cs, 0);
+	}
 	return 0;
 }
 
diff --git a/src/thread/pthread_getname_np.c b/src/thread/pthread_getname_np.c
new file mode 100644
index 00000000..85504e45
--- /dev/null
+++ b/src/thread/pthread_getname_np.c
@@ -0,0 +1,25 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/prctl.h>
+
+#include "pthread_impl.h"
+
+int pthread_getname_np(pthread_t thread, char *name, size_t len)
+{
+	int fd, cs, status = 0;
+	char f[sizeof "/proc/self/task//comm" + 3*sizeof(int)];
+
+	if (len < 16) return ERANGE;
+
+	if (thread == pthread_self())
+		return prctl(PR_GET_NAME, (unsigned long)name, 0UL, 0UL, 0UL) ? errno : 0;
+
+	snprintf(f, sizeof f, "/proc/self/task/%d/comm", thread->tid);
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+	if ((fd = open(f, O_RDONLY|O_CLOEXEC)) < 0 || (len = read(fd, name, len)) == -1) status = errno;
+	else name[len-1] = 0; /* remove trailing new line only if successful */
+	if (fd >= 0) close(fd);
+	pthread_setcancelstate(cs, 0);
+	return status;
+}
diff --git a/src/thread/pthread_getschedparam.c b/src/thread/pthread_getschedparam.c
index 1cba073d..c098befb 100644
--- a/src/thread/pthread_getschedparam.c
+++ b/src/thread/pthread_getschedparam.c
@@ -4,6 +4,8 @@
 int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param *restrict param)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	if (!t->tid) {
 		r = ESRCH;
@@ -14,5 +16,6 @@ int pthread_getschedparam(pthread_t t, int *restrict policy, struct sched_param
 		}
 	}
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_key_create.c b/src/thread/pthread_key_create.c
index d1120941..39770c7a 100644
--- a/src/thread/pthread_key_create.c
+++ b/src/thread/pthread_key_create.c
@@ -1,4 +1,5 @@
 #include "pthread_impl.h"
+#include "fork_impl.h"
 
 volatile size_t __pthread_tsd_size = sizeof(void *) * PTHREAD_KEYS_MAX;
 void *__pthread_tsd_main[PTHREAD_KEYS_MAX] = { 0 };
@@ -20,6 +21,13 @@ static void dummy_0(void)
 weak_alias(dummy_0, __tl_lock);
 weak_alias(dummy_0, __tl_unlock);
 
+void __pthread_key_atfork(int who)
+{
+	if (who<0) __pthread_rwlock_rdlock(&key_lock);
+	else if (!who) __pthread_rwlock_unlock(&key_lock);
+	else key_lock = (pthread_rwlock_t)PTHREAD_RWLOCK_INITIALIZER;
+}
+
 int __pthread_key_create(pthread_key_t *k, void (*dtor)(void *))
 {
 	pthread_t self = __pthread_self();
diff --git a/src/thread/pthread_kill.c b/src/thread/pthread_kill.c
index 3d9395cb..79ddb209 100644
--- a/src/thread/pthread_kill.c
+++ b/src/thread/pthread_kill.c
@@ -4,9 +4,15 @@
 int pthread_kill(pthread_t t, int sig)
 {
 	int r;
+	sigset_t set;
+	/* Block not just app signals, but internal ones too, since
+	 * pthread_kill is used to implement pthread_cancel, which
+	 * must be async-cancel-safe. */
+	__block_all_sigs(&set);
 	LOCK(t->killlock);
 	r = t->tid ? -__syscall(SYS_tkill, t->tid, sig)
 		: (sig+0U >= _NSIG ? EINVAL : 0);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_mutex_destroy.c b/src/thread/pthread_mutex_destroy.c
index 6d49e689..8d1bf77b 100644
--- a/src/thread/pthread_mutex_destroy.c
+++ b/src/thread/pthread_mutex_destroy.c
@@ -1,6 +1,10 @@
-#include <pthread.h>
+#include "pthread_impl.h"
 
 int pthread_mutex_destroy(pthread_mutex_t *mutex)
 {
+	/* If the mutex being destroyed is process-shared and has nontrivial
+	 * type (tracking ownership), it might be in the pending slot of a
+	 * robust_list; wait for quiescence. */
+	if (mutex->_m_type > 128) __vm_wait();
 	return 0;
 }
diff --git a/src/thread/pthread_mutexattr_setprotocol.c b/src/thread/pthread_mutexattr_setprotocol.c
index 511cc32d..8b80c1ce 100644
--- a/src/thread/pthread_mutexattr_setprotocol.c
+++ b/src/thread/pthread_mutexattr_setprotocol.c
@@ -1,24 +1,23 @@
 #include "pthread_impl.h"
 #include "syscall.h"
 
-static pthread_once_t check_pi_once;
-static int check_pi_result;
-
-static void check_pi()
-{
-	volatile int lk = 0;
-	check_pi_result = -__syscall(SYS_futex, &lk, FUTEX_LOCK_PI, 0, 0);
-}
+static volatile int check_pi_result = -1;
 
 int pthread_mutexattr_setprotocol(pthread_mutexattr_t *a, int protocol)
 {
+	int r;
 	switch (protocol) {
 	case PTHREAD_PRIO_NONE:
 		a->__attr &= ~8;
 		return 0;
 	case PTHREAD_PRIO_INHERIT:
-		pthread_once(&check_pi_once, check_pi);
-		if (check_pi_result) return check_pi_result;
+		r = check_pi_result;
+		if (r < 0) {
+			volatile int lk = 0;
+			r = -__syscall(SYS_futex, &lk, FUTEX_LOCK_PI, 0, 0);
+			a_store(&check_pi_result, r);
+		}
+		if (r) return r;
 		a->__attr |= 8;
 		return 0;
 	case PTHREAD_PRIO_PROTECT:
diff --git a/src/thread/pthread_mutexattr_setrobust.c b/src/thread/pthread_mutexattr_setrobust.c
index 04db92a6..30a9ac3b 100644
--- a/src/thread/pthread_mutexattr_setrobust.c
+++ b/src/thread/pthread_mutexattr_setrobust.c
@@ -1,22 +1,20 @@
 #include "pthread_impl.h"
 #include "syscall.h"
 
-static pthread_once_t check_robust_once;
-static int check_robust_result;
-
-static void check_robust()
-{
-	void *p;
-	size_t l;
-	check_robust_result = -__syscall(SYS_get_robust_list, 0, &p, &l);
-}
+static volatile int check_robust_result = -1;
 
 int pthread_mutexattr_setrobust(pthread_mutexattr_t *a, int robust)
 {
 	if (robust > 1U) return EINVAL;
 	if (robust) {
-		pthread_once(&check_robust_once, check_robust);
-		if (check_robust_result) return check_robust_result;
+		int r = check_robust_result;
+		if (r < 0) {
+			void *p;
+			size_t l;
+			r = -__syscall(SYS_get_robust_list, 0, &p, &l);
+			a_store(&check_robust_result, r);
+		}
+		if (r) return r;
 		a->__attr |= 4;
 		return 0;
 	}
diff --git a/src/thread/pthread_setname_np.c b/src/thread/pthread_setname_np.c
index 82d35e17..fc2d2306 100644
--- a/src/thread/pthread_setname_np.c
+++ b/src/thread/pthread_setname_np.c
@@ -19,7 +19,7 @@ int pthread_setname_np(pthread_t thread, const char *name)
 
 	snprintf(f, sizeof f, "/proc/self/task/%d/comm", thread->tid);
 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
-	if ((fd = open(f, O_WRONLY)) < 0 || write(fd, name, len) < 0) status = errno;
+	if ((fd = open(f, O_WRONLY|O_CLOEXEC)) < 0 || write(fd, name, len) < 0) status = errno;
 	if (fd >= 0) close(fd);
 	pthread_setcancelstate(cs, 0);
 	return status;
diff --git a/src/thread/pthread_setschedparam.c b/src/thread/pthread_setschedparam.c
index 038d13d8..76d4d45a 100644
--- a/src/thread/pthread_setschedparam.c
+++ b/src/thread/pthread_setschedparam.c
@@ -4,8 +4,11 @@
 int pthread_setschedparam(pthread_t t, int policy, const struct sched_param *param)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	r = !t->tid ? ESRCH : -__syscall(SYS_sched_setscheduler, t->tid, policy, param);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/pthread_setschedprio.c b/src/thread/pthread_setschedprio.c
index 5bf4a019..fc2e13dd 100644
--- a/src/thread/pthread_setschedprio.c
+++ b/src/thread/pthread_setschedprio.c
@@ -4,8 +4,11 @@
 int pthread_setschedprio(pthread_t t, int prio)
 {
 	int r;
+	sigset_t set;
+	__block_app_sigs(&set);
 	LOCK(t->killlock);
 	r = !t->tid ? ESRCH : -__syscall(SYS_sched_setparam, t->tid, &prio);
 	UNLOCK(t->killlock);
+	__restore_sigs(&set);
 	return r;
 }
diff --git a/src/thread/riscv32/__set_thread_area.s b/src/thread/riscv32/__set_thread_area.s
new file mode 100644
index 00000000..828154d2
--- /dev/null
+++ b/src/thread/riscv32/__set_thread_area.s
@@ -0,0 +1,6 @@
+.global __set_thread_area
+.type   __set_thread_area, %function
+__set_thread_area:
+	mv tp, a0
+	li a0, 0
+	ret
diff --git a/src/thread/riscv32/__unmapself.s b/src/thread/riscv32/__unmapself.s
new file mode 100644
index 00000000..2849119c
--- /dev/null
+++ b/src/thread/riscv32/__unmapself.s
@@ -0,0 +1,7 @@
+.global __unmapself
+.type __unmapself, %function
+__unmapself:
+	li a7, 215 # SYS_munmap
+	ecall
+	li a7, 93  # SYS_exit
+	ecall
diff --git a/src/thread/riscv32/clone.s b/src/thread/riscv32/clone.s
new file mode 100644
index 00000000..e2116e33
--- /dev/null
+++ b/src/thread/riscv32/clone.s
@@ -0,0 +1,35 @@
+# __clone(func, stack, flags, arg, ptid, tls, ctid)
+#           a0,    a1,    a2,  a3,   a4,  a5,   a6
+
+# syscall(SYS_clone, flags, stack, ptid, tls, ctid)
+#                a7     a0,    a1,   a2,  a3,   a4
+
+.global __clone
+.type  __clone, %function
+__clone:
+	# Save func and arg to stack
+	andi a1, a1, -16
+	addi a1, a1, -16
+	sw a0, 0(a1)
+	sw a3, 4(a1)
+
+	# Call SYS_clone
+	mv a0, a2
+	mv a2, a4
+	mv a3, a5
+	mv a4, a6
+	li a7, 220 # SYS_clone
+	ecall
+
+	beqz a0, 1f
+	# Parent
+	ret
+
+	# Child
+1:      lw a1, 0(sp)
+	lw a0, 4(sp)
+	jalr a1
+
+	# Exit
+	li a7, 93 # SYS_exit
+	ecall
diff --git a/src/thread/riscv32/syscall_cp.s b/src/thread/riscv32/syscall_cp.s
new file mode 100644
index 00000000..079d1ba0
--- /dev/null
+++ b/src/thread/riscv32/syscall_cp.s
@@ -0,0 +1,29 @@
+.global __cp_begin
+.hidden __cp_begin
+.global __cp_end
+.hidden __cp_end
+.global __cp_cancel
+.hidden __cp_cancel
+.hidden __cancel
+.global __syscall_cp_asm
+.hidden __syscall_cp_asm
+.type __syscall_cp_asm, %function
+__syscall_cp_asm:
+__cp_begin:
+	lw t0, 0(a0)
+	bnez t0, __cp_cancel
+
+	mv t0, a1
+	mv a0, a2
+	mv a1, a3
+	mv a2, a4
+	mv a3, a5
+	mv a4, a6
+	mv a5, a7
+	lw a6, 0(sp)
+	mv a7, t0
+	ecall
+__cp_end:
+	ret
+__cp_cancel:
+	tail __cancel
diff --git a/src/thread/riscv64/clone.s b/src/thread/riscv64/clone.s
index db908248..0e6f41a8 100644
--- a/src/thread/riscv64/clone.s
+++ b/src/thread/riscv64/clone.s
@@ -8,6 +8,7 @@
 .type  __clone, %function
 __clone:
 	# Save func and arg to stack
+	andi a1, a1, -16
 	addi a1, a1, -16
 	sd a0, 0(a1)
 	sd a3, 8(a1)
diff --git a/src/thread/s390x/__tls_get_offset.s b/src/thread/s390x/__tls_get_offset.s
index 8ee92de8..405f118b 100644
--- a/src/thread/s390x/__tls_get_offset.s
+++ b/src/thread/s390x/__tls_get_offset.s
@@ -1,17 +1,17 @@
 	.global __tls_get_offset
 	.type __tls_get_offset,%function
 __tls_get_offset:
-	stmg  %r14, %r15, 112(%r15)
-	aghi  %r15, -160
+	ear   %r0, %a0
+	sllg  %r0, %r0, 32
+	ear   %r0, %a1
 
-	la    %r2, 0(%r2, %r12)
-	brasl %r14, __tls_get_addr
+	la    %r1, 0(%r2, %r12)
 
-	ear   %r1, %a0
-	sllg  %r1, %r1, 32
-	ear   %r1, %a1
+	lg    %r3, 0(%r1)
+	sllg  %r4, %r3, 3
+	lg    %r5, 8(%r0)
+	lg    %r2, 0(%r4, %r5)
+	ag    %r2, 8(%r1)
+	sgr   %r2, %r0
 
-	sgr   %r2, %r1
-
-	lmg   %r14, %r15, 272(%r15)
 	br    %r14
diff --git a/src/thread/s390x/clone.s b/src/thread/s390x/clone.s
index 577748ea..2125f20b 100644
--- a/src/thread/s390x/clone.s
+++ b/src/thread/s390x/clone.s
@@ -17,6 +17,9 @@ __clone:
 	# if (!tid) syscall(SYS_exit, a(d));
 	# return tid;
 
+	# preserve call-saved register used as syscall arg
+	stg  %r6, 48(%r15)
+
 	# create initial stack frame for new thread
 	nill %r3, 0xfff8
 	aghi %r3, -160
@@ -35,6 +38,9 @@ __clone:
 	lg   %r6, 160(%r15)
 	svc  120
 
+	# restore call-saved register
+	lg   %r6, 48(%r15)
+
 	# if error or if we're the parent, return
 	ltgr %r2, %r2
 	bnzr %r14
diff --git a/src/thread/s390x/syscall_cp.s b/src/thread/s390x/syscall_cp.s
index c1da40de..d094cbf5 100644
--- a/src/thread/s390x/syscall_cp.s
+++ b/src/thread/s390x/syscall_cp.s
@@ -14,6 +14,7 @@ __cp_begin:
 	icm %r2, 15, 0(%r2)
 	jne __cp_cancel
 
+	stg %r6, 48(%r15)
 	stg %r7, 56(%r15)
 	lgr %r1, %r3
 	lgr %r2, %r4
@@ -26,6 +27,7 @@ __cp_begin:
 
 __cp_end:
 	lg  %r7, 56(%r15)
+	lg  %r6, 48(%r15)
 	br  %r14
 
 __cp_cancel:
diff --git a/src/thread/sem_getvalue.c b/src/thread/sem_getvalue.c
index d9d83071..c0b7762d 100644
--- a/src/thread/sem_getvalue.c
+++ b/src/thread/sem_getvalue.c
@@ -1,8 +1,9 @@
 #include <semaphore.h>
+#include <limits.h>
 
 int sem_getvalue(sem_t *restrict sem, int *restrict valp)
 {
 	int val = sem->__val[0];
-	*valp = val < 0 ? 0 : val;
+	*valp = val & SEM_VALUE_MAX;
 	return 0;
 }
diff --git a/src/thread/sem_open.c b/src/thread/sem_open.c
index de8555c5..0ad29de9 100644
--- a/src/thread/sem_open.c
+++ b/src/thread/sem_open.c
@@ -12,6 +12,12 @@
 #include <stdlib.h>
 #include <pthread.h>
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 static struct {
 	ino_t ino;
@@ -19,6 +25,7 @@ static struct {
 	int refcnt;
 } *semtab;
 static volatile int lock[1];
+volatile int *const __sem_open_lockptr = lock;
 
 #define FLAGS (O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NONBLOCK)
 
@@ -163,10 +170,12 @@ int sem_close(sem_t *sem)
 	int i;
 	LOCK(lock);
 	for (i=0; i<SEM_NSEMS_MAX && semtab[i].sem != sem; i++);
-	if (!--semtab[i].refcnt) {
-		semtab[i].sem = 0;
-		semtab[i].ino = 0;
+	if (--semtab[i].refcnt) {
+		UNLOCK(lock);
+		return 0;
 	}
+	semtab[i].sem = 0;
+	semtab[i].ino = 0;
 	UNLOCK(lock);
 	munmap(sem, sizeof *sem);
 	return 0;
diff --git a/src/thread/sem_post.c b/src/thread/sem_post.c
index 31e3293d..1c8f763c 100644
--- a/src/thread/sem_post.c
+++ b/src/thread/sem_post.c
@@ -1,17 +1,21 @@
 #include <semaphore.h>
+#include <limits.h>
 #include "pthread_impl.h"
 
 int sem_post(sem_t *sem)
 {
-	int val, waiters, priv = sem->__val[2];
+	int val, new, waiters, priv = sem->__val[2];
 	do {
 		val = sem->__val[0];
 		waiters = sem->__val[1];
-		if (val == SEM_VALUE_MAX) {
+		if ((val & SEM_VALUE_MAX) == SEM_VALUE_MAX) {
 			errno = EOVERFLOW;
 			return -1;
 		}
-	} while (a_cas(sem->__val, val, val+1+(val<0)) != val);
-	if (val<0 || waiters) __wake(sem->__val, 1, priv);
+		new = val + 1;
+		if (waiters <= 1)
+			new &= ~0x80000000;
+	} while (a_cas(sem->__val, val, new) != val);
+	if (val<0 || waiters) __wake(sem->__val, waiters>1 ? 1 : -1, priv);
 	return 0;
 }
diff --git a/src/thread/sem_timedwait.c b/src/thread/sem_timedwait.c
index 58d3ebfe..aa67376c 100644
--- a/src/thread/sem_timedwait.c
+++ b/src/thread/sem_timedwait.c
@@ -1,4 +1,5 @@
 #include <semaphore.h>
+#include <limits.h>
 #include "pthread_impl.h"
 
 static void cleanup(void *p)
@@ -13,14 +14,15 @@ int sem_timedwait(sem_t *restrict sem, const struct timespec *restrict at)
 	if (!sem_trywait(sem)) return 0;
 
 	int spins = 100;
-	while (spins-- && sem->__val[0] <= 0 && !sem->__val[1]) a_spin();
+	while (spins-- && !(sem->__val[0] & SEM_VALUE_MAX) && !sem->__val[1])
+		a_spin();
 
 	while (sem_trywait(sem)) {
-		int r;
+		int r, priv = sem->__val[2];
 		a_inc(sem->__val+1);
-		a_cas(sem->__val, 0, -1);
+		a_cas(sem->__val, 0, 0x80000000);
 		pthread_cleanup_push(cleanup, (void *)(sem->__val+1));
-		r = __timedwait_cp(sem->__val, -1, CLOCK_REALTIME, at, sem->__val[2]);
+		r = __timedwait_cp(sem->__val, 0x80000000, CLOCK_REALTIME, at, priv);
 		pthread_cleanup_pop(1);
 		if (r) {
 			errno = r;
diff --git a/src/thread/sem_trywait.c b/src/thread/sem_trywait.c
index 04edf46b..beb435da 100644
--- a/src/thread/sem_trywait.c
+++ b/src/thread/sem_trywait.c
@@ -1,12 +1,12 @@
 #include <semaphore.h>
+#include <limits.h>
 #include "pthread_impl.h"
 
 int sem_trywait(sem_t *sem)
 {
 	int val;
-	while ((val=sem->__val[0]) > 0) {
-		int new = val-1-(val==1 && sem->__val[1]);
-		if (a_cas(sem->__val, val, new)==val) return 0;
+	while ((val=sem->__val[0]) & SEM_VALUE_MAX) {
+		if (a_cas(sem->__val, val, val-1)==val) return 0;
 	}
 	errno = EAGAIN;
 	return -1;
diff --git a/src/thread/synccall.c b/src/thread/synccall.c
index 648a6ad4..38597254 100644
--- a/src/thread/synccall.c
+++ b/src/thread/synccall.c
@@ -11,7 +11,7 @@ weak_alias(dummy_0, __tl_unlock);
 
 static int target_tid;
 static void (*callback)(void *), *context;
-static sem_t target_sem, caller_sem;
+static sem_t target_sem, caller_sem, exit_sem;
 
 static void dummy(void *p)
 {
@@ -33,7 +33,7 @@ static void handler(int sig)
 	/* Inform caller we've complered the callback and wait
 	 * for the caller to release us to return. */
 	sem_post(&caller_sem);
-	sem_wait(&target_sem);
+	sem_wait(&exit_sem);
 
 	/* Inform caller we are returning and state is destroyable. */
 	sem_post(&caller_sem);
@@ -45,7 +45,7 @@ void __synccall(void (*func)(void *), void *ctx)
 {
 	sigset_t oldmask;
 	int cs, i, r;
-	struct sigaction sa = { .sa_flags = SA_RESTART, .sa_handler = handler };
+	struct sigaction sa = { .sa_flags = SA_RESTART | SA_ONSTACK, .sa_handler = handler };
 	pthread_t self = __pthread_self(), td;
 	int count = 0;
 
@@ -62,8 +62,10 @@ void __synccall(void (*func)(void *), void *ctx)
 
 	sem_init(&target_sem, 0, 0);
 	sem_init(&caller_sem, 0, 0);
+	sem_init(&exit_sem, 0, 0);
 
-	if (!libc.threads_minus_1) goto single_threaded;
+	if (!libc.threads_minus_1 || __syscall(SYS_gettid) != self->tid)
+		goto single_threaded;
 
 	callback = func;
 	context = ctx;
@@ -106,12 +108,13 @@ single_threaded:
 	/* Only release the caught threads once all threads, including the
 	 * caller, have returned from the callback function. */
 	for (i=0; i<count; i++)
-		sem_post(&target_sem);
+		sem_post(&exit_sem);
 	for (i=0; i<count; i++)
 		sem_wait(&caller_sem);
 
 	sem_destroy(&caller_sem);
 	sem_destroy(&target_sem);
+	sem_destroy(&exit_sem);
 
 	pthread_setcancelstate(cs, 0);
 	__tl_unlock();
diff --git a/src/thread/vmlock.c b/src/thread/vmlock.c
index 75f3cb76..fa0a8e3c 100644
--- a/src/thread/vmlock.c
+++ b/src/thread/vmlock.c
@@ -1,6 +1,8 @@
 #include "pthread_impl.h"
+#include "fork_impl.h"
 
 static volatile int vmlock[2];
+volatile int *const __vmlock_lockptr = vmlock;
 
 void __vm_wait()
 {
diff --git a/src/time/__map_file.c b/src/time/__map_file.c
index 9d376222..c2b29fe8 100644
--- a/src/time/__map_file.c
+++ b/src/time/__map_file.c
@@ -9,7 +9,7 @@ const char unsigned *__map_file(const char *pathname, size_t *size)
 	const unsigned char *map = MAP_FAILED;
 	int fd = sys_open(pathname, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
 	if (fd < 0) return 0;
-	if (!syscall(SYS_fstat, fd, &st)) {
+	if (!__fstat(fd, &st)) {
 		map = __mmap(0, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
 		*size = st.st_size;
 	}
diff --git a/src/time/__tz.c b/src/time/__tz.c
index 185642e8..54ed4cf6 100644
--- a/src/time/__tz.c
+++ b/src/time/__tz.c
@@ -4,8 +4,15 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
+#include <ctype.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
 long  __timezone = 0;
 int   __daylight = 0;
@@ -17,7 +24,6 @@ weak_alias(__tzname, tzname);
 
 static char std_name[TZNAME_MAX+1];
 static char dst_name[TZNAME_MAX+1];
-const char __utc[] = "UTC";
 
 static int dst_off;
 static int r0[5], r1[5];
@@ -30,6 +36,7 @@ static char *old_tz = old_tz_buf;
 static size_t old_tz_size = sizeof old_tz_buf;
 
 static volatile int lock[1];
+volatile int *const __timezone_lockptr = lock;
 
 static int getint(const char **p)
 {
@@ -86,15 +93,15 @@ static void getname(char *d, const char **p)
 	int i;
 	if (**p == '<') {
 		++*p;
-		for (i=0; (*p)[i]!='>' && i<TZNAME_MAX; i++)
-			d[i] = (*p)[i];
-		++*p;
+		for (i=0; (*p)[i] && (*p)[i]!='>'; i++)
+			if (i<TZNAME_MAX) d[i] = (*p)[i];
+		if ((*p)[i]) ++*p;
 	} else {
-		for (i=0; ((*p)[i]|32)-'a'<26U && i<TZNAME_MAX; i++)
-			d[i] = (*p)[i];
+		for (i=0; ((*p)[i]|32)-'a'<26U; i++)
+			if (i<TZNAME_MAX) d[i] = (*p)[i];
 	}
 	*p += i;
-	d[i] = 0;
+	d[i<TZNAME_MAX?i:TZNAME_MAX] = 0;
 }
 
 #define VEC(...) ((const unsigned char[]){__VA_ARGS__})
@@ -147,10 +154,21 @@ static void do_tzset()
 	}
 	if (old_tz) memcpy(old_tz, s, i+1);
 
+	int posix_form = 0;
+	if (*s != ':') {
+		p = s;
+		char dummy_name[TZNAME_MAX+1];
+		getname(dummy_name, &p);
+		if (p!=s && (*p == '+' || *p == '-' || isdigit(*p)
+		             || !strcmp(dummy_name, "UTC")
+		             || !strcmp(dummy_name, "GMT")))
+			posix_form = 1;
+	}	
+
 	/* Non-suid can use an absolute tzfile pathname or a relative
 	 * pathame beginning with "."; in secure mode, only the
 	 * standard path will be searched. */
-	if (*s == ':' || ((p=strchr(s, '/')) && !memchr(s, ',', p-s))) {
+	if (!posix_form) {
 		if (*s == ':') s++;
 		if (*s == '/' || *s == '.') {
 			if (!libc.secure || !strcmp(s, "/etc/localtime"))
@@ -178,7 +196,7 @@ static void do_tzset()
 	zi = map;
 	if (map) {
 		int scale = 2;
-		if (sizeof(time_t) > 4 && map[4]=='2') {
+		if (map[4]!='1') {
 			size_t skip = zi_dotprod(zi+20, VEC(1,1,8,5,6,1), 6);
 			trans = zi+skip+44+44;
 			scale++;
@@ -274,22 +292,20 @@ static size_t scan_trans(long long t, int local, size_t *alt)
 	n = (index-trans)>>scale;
 	if (a == n-1) return -1;
 	if (a == 0) {
-		x = zi_read32(trans + (a<<scale));
-		if (scale == 3) x = x<<32 | zi_read32(trans + (a<<scale) + 4);
+		x = zi_read32(trans);
+		if (scale == 3) x = x<<32 | zi_read32(trans + 4);
 		else x = (int32_t)x;
-		if (local) off = (int32_t)zi_read32(types + 6 * index[a-1]);
+		/* Find the lowest non-DST type, or 0 if none. */
+		size_t j = 0;
+		for (size_t i=abbrevs-types; i; i-=6) {
+			if (!types[i-6+4]) j = i-6;
+		}
+		if (local) off = (int32_t)zi_read32(types + j);
+		/* If t is before first transition, use the above-found type
+		 * and the index-zero (after transition) type as the alt. */
 		if (t - off < (int64_t)x) {
-			for (a=0; a<(abbrevs-types)/6; a++) {
-				if (types[6*a+4] != types[4]) break;
-			}
-			if (a == (abbrevs-types)/6) a = 0;
-			if (types[6*a+4]) {
-				*alt = a;
-				return 0;
-			} else {
-				*alt = 0;
-				return a;
-			}
+			if (alt) *alt = index[0];
+			return j/6;
 		}
 	}
 
diff --git a/src/time/__utc.c b/src/time/__utc.c
new file mode 100644
index 00000000..9e8bfc58
--- /dev/null
+++ b/src/time/__utc.c
@@ -0,0 +1,3 @@
+#include "time_impl.h"
+
+const char __utc[] = "UTC";
diff --git a/src/time/__year_to_secs.c b/src/time/__year_to_secs.c
index 2824ec6d..b42f5a6d 100644
--- a/src/time/__year_to_secs.c
+++ b/src/time/__year_to_secs.c
@@ -10,9 +10,9 @@ long long __year_to_secs(long long year, int *is_leap)
 		return 31536000*(y-70) + 86400*leaps;
 	}
 
-	int cycles, centuries, leaps, rem;
+	int cycles, centuries, leaps, rem, dummy;
 
-	if (!is_leap) is_leap = &(int){0};
+	if (!is_leap) is_leap = &dummy;
 	cycles = (year-100) / 400;
 	rem = (year-100) % 400;
 	if (rem < 0) {
diff --git a/src/time/clock_getcpuclockid.c b/src/time/clock_getcpuclockid.c
index 8a0e2d4c..bce1e8ab 100644
--- a/src/time/clock_getcpuclockid.c
+++ b/src/time/clock_getcpuclockid.c
@@ -8,6 +8,7 @@ int clock_getcpuclockid(pid_t pid, clockid_t *clk)
 	struct timespec ts;
 	clockid_t id = (-pid-1)*8U + 2;
 	int ret = __syscall(SYS_clock_getres, id, &ts);
+	if (ret == -EINVAL) ret = -ESRCH;
 	if (ret) return -ret;
 	*clk = id;
 	return 0;
diff --git a/src/time/clock_gettime.c b/src/time/clock_gettime.c
index 3e1d0975..4d2ec22f 100644
--- a/src/time/clock_gettime.c
+++ b/src/time/clock_gettime.c
@@ -42,6 +42,9 @@ static int cgt_init(clockid_t clk, struct timespec *ts)
 			p = cgt_time32_wrap;
 		}
 	}
+#ifdef VDSO_CGT_WORKAROUND
+	if (!__vdsosym(VDSO_CGT32_VER, VDSO_CGT32_SYM)) p = 0;
+#endif
 #endif
 	int (*f)(clockid_t, struct timespec *) =
 		(int (*)(clockid_t, struct timespec *))p;
@@ -80,10 +83,12 @@ int __clock_gettime(clockid_t clk, struct timespec *ts)
 		return __syscall_ret(r);
 	long ts32[2];
 	r = __syscall(SYS_clock_gettime, clk, ts32);
+#ifdef SYS_gettimeofday
 	if (r==-ENOSYS && clk==CLOCK_REALTIME) {
 		r = __syscall(SYS_gettimeofday, ts32, 0);
 		ts32[1] *= 1000;
 	}
+#endif
 	if (!r) {
 		ts->tv_sec = ts32[0];
 		ts->tv_nsec = ts32[1];
@@ -92,6 +97,7 @@ int __clock_gettime(clockid_t clk, struct timespec *ts)
 	return __syscall_ret(r);
 #else
 	r = __syscall(SYS_clock_gettime, clk, ts);
+#ifdef SYS_gettimeofday
 	if (r == -ENOSYS) {
 		if (clk == CLOCK_REALTIME) {
 			__syscall(SYS_gettimeofday, ts, 0);
@@ -100,6 +106,7 @@ int __clock_gettime(clockid_t clk, struct timespec *ts)
 		}
 		r = -EINVAL;
 	}
+#endif
 	return __syscall_ret(r);
 #endif
 }
diff --git a/src/time/strftime.c b/src/time/strftime.c
index cc53d536..c40246db 100644
--- a/src/time/strftime.c
+++ b/src/time/strftime.c
@@ -3,6 +3,7 @@
 #include <string.h>
 #include <langinfo.h>
 #include <locale.h>
+#include <ctype.h>
 #include <time.h>
 #include <limits.h>
 #include "locale_impl.h"
@@ -233,7 +234,12 @@ size_t __strftime_l(char *restrict s, size_t n, const char *restrict f, const st
 		pad = 0;
 		if (*f == '-' || *f == '_' || *f == '0') pad = *f++;
 		if ((plus = (*f == '+'))) f++;
-		width = strtoul(f, &p, 10);
+		if (isdigit(*f)) {
+			width = strtoul(f, &p, 10);
+		} else {
+			width = 0;
+			p = (void *)f;
+		}
 		if (*p == 'C' || *p == 'F' || *p == 'G' || *p == 'Y') {
 			if (!width && p!=f) width = 1;
 		} else {
diff --git a/src/time/strptime.c b/src/time/strptime.c
index c54a0d8c..b1147242 100644
--- a/src/time/strptime.c
+++ b/src/time/strptime.c
@@ -59,6 +59,22 @@ char *strptime(const char *restrict s, const char *restrict f, struct tm *restri
 			s = strptime(s, "%m/%d/%y", tm);
 			if (!s) return 0;
 			break;
+		case 'F':
+			/* Use temp buffer to implement the odd requirement
+			 * that entire field be width-limited but the year
+			 * subfield not itself be limited. */
+			i = 0;
+			char tmp[20];
+			if (*s == '-' || *s == '+') tmp[i++] = *s++;
+			while (*s=='0' && isdigit(s[1])) s++;
+			for (; *s && i<(size_t)w && i+1<sizeof tmp; i++) {
+				tmp[i] = *s++;
+			}
+			tmp[i] = 0;
+			char *p = strptime(tmp, "%12Y-%m-%d", tm);
+			if (!p) return 0;
+			s -= tmp+i-p;
+			break;
 		case 'H':
 			dest = &tm->tm_hour;
 			min = 0;
@@ -114,6 +130,13 @@ char *strptime(const char *restrict s, const char *restrict f, struct tm *restri
 			s = strptime(s, "%H:%M", tm);
 			if (!s) return 0;
 			break;
+		case 's':
+			/* Parse only. Effect on tm is unspecified
+			 * and presently no effect is implemented.. */
+			if (*s == '-') s++;
+			if (!isdigit(*s)) return 0;
+			while (isdigit(*s)) s++;
+			break;
 		case 'S':
 			dest = &tm->tm_sec;
 			min = 0;
@@ -125,11 +148,30 @@ char *strptime(const char *restrict s, const char *restrict f, struct tm *restri
 			break;
 		case 'U':
 		case 'W':
-			/* Throw away result, for now. (FIXME?) */
+			/* Throw away result of %U, %V, %W, %g, and %G. Effect
+			 * is unspecified and there is no clear right choice. */
 			dest = &dummy;
 			min = 0;
 			range = 54;
 			goto numeric_range;
+		case 'V':
+			dest = &dummy;
+			min = 1;
+			range = 53;
+			goto numeric_range;
+		case 'g':
+			dest = &dummy;
+			w = 2;
+			goto numeric_digits;
+		case 'G':
+			dest = &dummy;
+			if (w<0) w=4;
+			goto numeric_digits;
+		case 'u':
+			dest = &tm->tm_wday;
+			min = 1;
+			range = 7;
+			goto numeric_range;
 		case 'w':
 			dest = &tm->tm_wday;
 			min = 0;
@@ -154,6 +196,28 @@ char *strptime(const char *restrict s, const char *restrict f, struct tm *restri
 			adj = 1900;
 			want_century = 0;
 			goto numeric_digits;
+		case 'z':
+			if (*s == '+') neg = 0;
+			else if (*s == '-') neg = 1;
+			else return 0;
+			for (i=0; i<4; i++) if (!isdigit(s[1+i])) return 0;
+			tm->__tm_gmtoff = (s[1]-'0')*36000+(s[2]-'0')*3600
+				+ (s[3]-'0')*600 + (s[4]-'0')*60;
+			if (neg) tm->__tm_gmtoff = -tm->__tm_gmtoff;
+			s += 5;
+			break;
+		case 'Z':
+			if (!strncmp(s, tzname[0], len = strlen(tzname[0]))) {
+				tm->tm_isdst = 0;
+				s += len;
+			} else if (!strncmp(s, tzname[1], len=strlen(tzname[1]))) {
+				tm->tm_isdst = 1;
+				s += len;
+			} else {
+				/* FIXME: is this supposed to be an error? */
+				while ((*s|32)-'a' <= 'z'-'a') s++;
+			}
+			break;
 		case '%':
 			if (*s++ != '%') return 0;
 			break;
diff --git a/src/time/timer_create.c b/src/time/timer_create.c
index 455d49fc..cc6c2236 100644
--- a/src/time/timer_create.c
+++ b/src/time/timer_create.c
@@ -1,7 +1,9 @@
 #include <time.h>
 #include <setjmp.h>
 #include <limits.h>
+#include <semaphore.h>
 #include "pthread_impl.h"
+#include "atomic.h"
 
 struct ksigevent {
 	union sigval sigev_value;
@@ -11,7 +13,7 @@ struct ksigevent {
 };
 
 struct start_args {
-	pthread_barrier_t b;
+	sem_t sem1, sem2;
 	struct sigevent *sev;
 };
 
@@ -20,10 +22,16 @@ static void dummy_0()
 }
 weak_alias(dummy_0, __pthread_tsd_run_dtors);
 
+static void timer_handler(int sig, siginfo_t *si, void *ctx)
+{
+}
+
 static void cleanup_fromsig(void *p)
 {
 	pthread_t self = __pthread_self();
 	__pthread_tsd_run_dtors();
+	__block_app_sigs(0);
+	__syscall(SYS_rt_sigprocmask, SIG_BLOCK, SIGTIMER_SET, 0, _NSIG/8);
 	self->cancel = 0;
 	self->cancelbuf = 0;
 	self->canceldisable = 0;
@@ -32,19 +40,6 @@ static void cleanup_fromsig(void *p)
 	longjmp(p, 1);
 }
 
-static void timer_handler(int sig, siginfo_t *si, void *ctx)
-{
-}
-
-static void install_handler()
-{
-	struct sigaction sa = {
-		.sa_sigaction = timer_handler,
-		.sa_flags = SA_SIGINFO | SA_RESTART
-	};
-	__libc_sigaction(SIGTIMER, &sa, 0);
-}
-
 static void *start(void *arg)
 {
 	pthread_t self = __pthread_self();
@@ -54,7 +49,16 @@ static void *start(void *arg)
 	void (*notify)(union sigval) = args->sev->sigev_notify_function;
 	union sigval val = args->sev->sigev_value;
 
-	pthread_barrier_wait(&args->b);
+	/* The two-way semaphore synchronization ensures that we see
+	 * self->cancel set by the parent if timer creation failed or
+	 * self->timer_id if it succeeded, and informs the parent that
+	 * we are done accessing the arguments so that the parent can
+	 * proceed past their block lifetime. */
+	while (sem_wait(&args->sem1));
+	sem_post(&args->sem2);
+
+	if (self->cancel)
+		return 0;
 	for (;;) {
 		siginfo_t si;
 		while (sigwaitinfo(SIGTIMER_SET, &si) < 0);
@@ -71,7 +75,7 @@ static void *start(void *arg)
 
 int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict res)
 {
-	static pthread_once_t once = PTHREAD_ONCE_INIT;
+	static volatile int init = 0;
 	pthread_t td;
 	pthread_attr_t attr;
 	int r;
@@ -83,11 +87,15 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 	switch (evp ? evp->sigev_notify : SIGEV_SIGNAL) {
 	case SIGEV_NONE:
 	case SIGEV_SIGNAL:
+	case SIGEV_THREAD_ID:
 		if (evp) {
 			ksev.sigev_value = evp->sigev_value;
 			ksev.sigev_signo = evp->sigev_signo;
 			ksev.sigev_notify = evp->sigev_notify;
-			ksev.sigev_tid = 0;
+			if (evp->sigev_notify == SIGEV_THREAD_ID)
+				ksev.sigev_tid = evp->sigev_notify_thread_id;
+			else
+				ksev.sigev_tid = 0;
 			ksevp = &ksev;
 		}
 		if (syscall(SYS_timer_create, clk, ksevp, &timerid) < 0)
@@ -95,13 +103,21 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 		*res = (void *)(intptr_t)timerid;
 		break;
 	case SIGEV_THREAD:
-		pthread_once(&once, install_handler);
+		if (!init) {
+			struct sigaction sa = {
+				.sa_sigaction = timer_handler,
+				.sa_flags = SA_SIGINFO | SA_RESTART
+			};
+			__libc_sigaction(SIGTIMER, &sa, 0);
+			a_store(&init, 1);
+		}
 		if (evp->sigev_notify_attributes)
 			attr = *evp->sigev_notify_attributes;
 		else
 			pthread_attr_init(&attr);
 		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-		pthread_barrier_init(&args.b, 0, 2);
+		sem_init(&args.sem1, 0, 0);
+		sem_init(&args.sem2, 0, 0);
 		args.sev = evp;
 
 		__block_app_sigs(&set);
@@ -115,12 +131,15 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 
 		ksev.sigev_value.sival_ptr = 0;
 		ksev.sigev_signo = SIGTIMER;
-		ksev.sigev_notify = 4; /* SIGEV_THREAD_ID */
+		ksev.sigev_notify = SIGEV_THREAD_ID;
 		ksev.sigev_tid = td->tid;
-		if (syscall(SYS_timer_create, clk, &ksev, &timerid) < 0)
+		if (syscall(SYS_timer_create, clk, &ksev, &timerid) < 0) {
 			timerid = -1;
+			td->cancel = 1;
+		}
 		td->timer_id = timerid;
-		pthread_barrier_wait(&args.b);
+		sem_post(&args.sem1);
+		while (sem_wait(&args.sem2));
 		if (timerid < 0) return -1;
 		*res = (void *)(INTPTR_MIN | (uintptr_t)td>>1);
 		break;
diff --git a/src/unistd/close.c b/src/unistd/close.c
index 5b38e019..a2105f50 100644
--- a/src/unistd/close.c
+++ b/src/unistd/close.c
@@ -1,5 +1,6 @@
 #include <unistd.h>
 #include <errno.h>
+#include "aio_impl.h"
 #include "syscall.h"
 
 static int dummy(int fd)
diff --git a/src/unistd/dup3.c b/src/unistd/dup3.c
index f919f791..40798bde 100644
--- a/src/unistd/dup3.c
+++ b/src/unistd/dup3.c
@@ -9,12 +9,14 @@ int __dup3(int old, int new, int flags)
 	int r;
 #ifdef SYS_dup2
 	if (old==new) return __syscall_ret(-EINVAL);
-	if (flags & O_CLOEXEC) {
+	if (flags) {
 		while ((r=__syscall(SYS_dup3, old, new, flags))==-EBUSY);
 		if (r!=-ENOSYS) return __syscall_ret(r);
+		if (flags & ~O_CLOEXEC) return __syscall_ret(-EINVAL);
 	}
 	while ((r=__syscall(SYS_dup2, old, new))==-EBUSY);
-	if (flags & O_CLOEXEC) __syscall(SYS_fcntl, new, F_SETFD, FD_CLOEXEC);
+	if (r >= 0 && (flags & O_CLOEXEC))
+		__syscall(SYS_fcntl, new, F_SETFD, FD_CLOEXEC);
 #else
 	while ((r=__syscall(SYS_dup3, old, new, flags))==-EBUSY);
 #endif
diff --git a/src/unistd/faccessat.c b/src/unistd/faccessat.c
index 76bbd4c7..43052dd7 100644
--- a/src/unistd/faccessat.c
+++ b/src/unistd/faccessat.c
@@ -25,12 +25,17 @@ static int checker(void *p)
 
 int faccessat(int fd, const char *filename, int amode, int flag)
 {
-	if (!flag || (flag==AT_EACCESS && getuid()==geteuid() && getgid()==getegid()))
-		return syscall(SYS_faccessat, fd, filename, amode, flag);
+	if (flag) {
+		int ret = __syscall(SYS_faccessat2, fd, filename, amode, flag);
+		if (ret != -ENOSYS) return __syscall_ret(ret);
+	}
 
-	if (flag != AT_EACCESS)
+	if (flag & ~AT_EACCESS)
 		return __syscall_ret(-EINVAL);
 
+	if (!flag || (getuid()==geteuid() && getgid()==getegid()))
+		return syscall(SYS_faccessat, fd, filename, amode);
+
 	char stack[1024];
 	sigset_t set;
 	pid_t pid;
@@ -48,7 +53,7 @@ int faccessat(int fd, const char *filename, int amode, int flag)
 	if (pid<0 || __syscall(SYS_read, p[0], &ret, sizeof ret) != sizeof(ret))
 		ret = -EBUSY;
 	__syscall(SYS_close, p[0]);
-	__syscall(SYS_wait4, pid, &status, __WCLONE, 0);
+	__sys_wait4(pid, &status, __WCLONE, 0);
 
 	__restore_sigs(&set);
 
diff --git a/src/unistd/ftruncate.c b/src/unistd/ftruncate.c
index b41be0fa..54ff34bc 100644
--- a/src/unistd/ftruncate.c
+++ b/src/unistd/ftruncate.c
@@ -5,5 +5,3 @@ int ftruncate(int fd, off_t length)
 {
 	return syscall(SYS_ftruncate, fd, __SYSCALL_LL_O(length));
 }
-
-weak_alias(ftruncate, ftruncate64);
diff --git a/src/unistd/isatty.c b/src/unistd/isatty.c
index 75a9c186..21222eda 100644
--- a/src/unistd/isatty.c
+++ b/src/unistd/isatty.c
@@ -6,8 +6,6 @@
 int isatty(int fd)
 {
 	struct winsize wsz;
-	unsigned long r = syscall(SYS_ioctl, fd, TIOCGWINSZ, &wsz);
-	if (r == 0) return 1;
-	if (errno != EBADF) errno = ENOTTY;
-	return 0;
+	/* +1 converts from error status (0/-1) to boolean (1/0) */
+	return syscall(SYS_ioctl, fd, TIOCGWINSZ, &wsz) + 1;
 }
diff --git a/src/unistd/lseek.c b/src/unistd/lseek.c
index b4984f3e..f5b66682 100644
--- a/src/unistd/lseek.c
+++ b/src/unistd/lseek.c
@@ -12,4 +12,3 @@ off_t __lseek(int fd, off_t offset, int whence)
 }
 
 weak_alias(__lseek, lseek);
-weak_alias(__lseek, lseek64);
diff --git a/src/unistd/mipsn32/lseek.c b/src/unistd/mipsn32/lseek.c
index 60e74a51..0f6cbcaa 100644
--- a/src/unistd/mipsn32/lseek.c
+++ b/src/unistd/mipsn32/lseek.c
@@ -17,4 +17,3 @@ off_t __lseek(int fd, off_t offset, int whence)
 }
 
 weak_alias(__lseek, lseek);
-weak_alias(__lseek, lseek64);
diff --git a/src/unistd/nice.c b/src/unistd/nice.c
index 6c25c8c3..1c2295ff 100644
--- a/src/unistd/nice.c
+++ b/src/unistd/nice.c
@@ -1,4 +1,5 @@
 #include <unistd.h>
+#include <errno.h>
 #include <sys/resource.h>
 #include <limits.h>
 #include "syscall.h"
@@ -12,5 +13,11 @@ int nice(int inc)
 		prio += getpriority(PRIO_PROCESS, 0);
 	if (prio > NZERO-1) prio = NZERO-1;
 	if (prio < -NZERO) prio = -NZERO;
-	return setpriority(PRIO_PROCESS, 0, prio) ? -1 : prio;
+	if (setpriority(PRIO_PROCESS, 0, prio)) {
+		if (errno == EACCES)
+			errno = EPERM;
+		return -1;
+	} else {
+		return prio;
+	}
 }
diff --git a/src/unistd/pause.c b/src/unistd/pause.c
index 90bbf4ca..90cc8db5 100644
--- a/src/unistd/pause.c
+++ b/src/unistd/pause.c
@@ -3,9 +3,5 @@
 
 int pause(void)
 {
-#ifdef SYS_pause
-	return syscall_cp(SYS_pause);
-#else
-	return syscall_cp(SYS_ppoll, 0, 0, 0, 0);
-#endif
+	return sys_pause_cp();
 }
diff --git a/src/unistd/pipe2.c b/src/unistd/pipe2.c
index f24f74fb..a096990b 100644
--- a/src/unistd/pipe2.c
+++ b/src/unistd/pipe2.c
@@ -8,6 +8,7 @@ int pipe2(int fd[2], int flag)
 	if (!flag) return pipe(fd);
 	int ret = __syscall(SYS_pipe2, fd, flag);
 	if (ret != -ENOSYS) return __syscall_ret(ret);
+	if (flag & ~(O_CLOEXEC|O_NONBLOCK)) return __syscall_ret(-EINVAL);
 	ret = pipe(fd);
 	if (ret) return ret;
 	if (flag & O_CLOEXEC) {
diff --git a/src/unistd/pread.c b/src/unistd/pread.c
index 5681b045..b03fb0ad 100644
--- a/src/unistd/pread.c
+++ b/src/unistd/pread.c
@@ -5,5 +5,3 @@ ssize_t pread(int fd, void *buf, size_t size, off_t ofs)
 {
 	return syscall_cp(SYS_pread, fd, buf, size, __SYSCALL_LL_PRW(ofs));
 }
-
-weak_alias(pread, pread64);
diff --git a/src/unistd/preadv.c b/src/unistd/preadv.c
index 8376d60f..890ab403 100644
--- a/src/unistd/preadv.c
+++ b/src/unistd/preadv.c
@@ -8,5 +8,3 @@ ssize_t preadv(int fd, const struct iovec *iov, int count, off_t ofs)
 	return syscall_cp(SYS_preadv, fd, iov, count,
 		(long)(ofs), (long)(ofs>>32));
 }
-
-weak_alias(preadv, preadv64);
diff --git a/src/unistd/pwrite.c b/src/unistd/pwrite.c
index ca376576..a008b3ec 100644
--- a/src/unistd/pwrite.c
+++ b/src/unistd/pwrite.c
@@ -1,9 +1,18 @@
+#define _GNU_SOURCE
 #include <unistd.h>
+#include <sys/uio.h>
+#include <fcntl.h>
 #include "syscall.h"
 
 ssize_t pwrite(int fd, const void *buf, size_t size, off_t ofs)
 {
+	if (ofs == -1) ofs--;
+	int r = __syscall_cp(SYS_pwritev2, fd,
+		(&(struct iovec){ .iov_base = (void *)buf, .iov_len = size }),
+		1, (long)(ofs), (long)(ofs>>32), RWF_NOAPPEND);
+	if (r != -EOPNOTSUPP && r != -ENOSYS)
+		return __syscall_ret(r);
+	if (fcntl(fd, F_GETFL) & O_APPEND)
+		return __syscall_ret(-EOPNOTSUPP);
 	return syscall_cp(SYS_pwrite, fd, buf, size, __SYSCALL_LL_PRW(ofs));
 }
-
-weak_alias(pwrite, pwrite64);
diff --git a/src/unistd/pwritev.c b/src/unistd/pwritev.c
index f5a612c4..44a53d85 100644
--- a/src/unistd/pwritev.c
+++ b/src/unistd/pwritev.c
@@ -1,12 +1,18 @@
-#define _BSD_SOURCE
+#define _GNU_SOURCE
 #include <sys/uio.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include "syscall.h"
 
 ssize_t pwritev(int fd, const struct iovec *iov, int count, off_t ofs)
 {
+	if (ofs == -1) ofs--;
+	int r = __syscall_cp(SYS_pwritev2, fd, iov, count,
+		(long)(ofs), (long)(ofs>>32), RWF_NOAPPEND);
+	if (r != -EOPNOTSUPP && r != -ENOSYS)
+		return __syscall_ret(r);
+	if (fcntl(fd, F_GETFL) & O_APPEND)
+		return __syscall_ret(-EOPNOTSUPP);
 	return syscall_cp(SYS_pwritev, fd, iov, count,
 		(long)(ofs), (long)(ofs>>32));
 }
-
-weak_alias(pwritev, pwritev64);
diff --git a/src/unistd/readlink.c b/src/unistd/readlink.c
index a152d524..32f4537f 100644
--- a/src/unistd/readlink.c
+++ b/src/unistd/readlink.c
@@ -4,9 +4,16 @@
 
 ssize_t readlink(const char *restrict path, char *restrict buf, size_t bufsize)
 {
+	char dummy[1];
+	if (!bufsize) {
+		buf = dummy;
+		bufsize = 1;
+	}
 #ifdef SYS_readlink
-	return syscall(SYS_readlink, path, buf, bufsize);
+	int r = __syscall(SYS_readlink, path, buf, bufsize);
 #else
-	return syscall(SYS_readlinkat, AT_FDCWD, path, buf, bufsize);
+	int r = __syscall(SYS_readlinkat, AT_FDCWD, path, buf, bufsize);
 #endif
+	if (buf == dummy && r > 0) r = 0;
+	return __syscall_ret(r);
 }
diff --git a/src/unistd/readlinkat.c b/src/unistd/readlinkat.c
index 9af45cd5..f79d3d14 100644
--- a/src/unistd/readlinkat.c
+++ b/src/unistd/readlinkat.c
@@ -3,5 +3,12 @@
 
 ssize_t readlinkat(int fd, const char *restrict path, char *restrict buf, size_t bufsize)
 {
-	return syscall(SYS_readlinkat, fd, path, buf, bufsize);
+	char dummy[1];
+	if (!bufsize) {
+		buf = dummy;
+		bufsize = 1;
+	}
+	int r = __syscall(SYS_readlinkat, fd, path, buf, bufsize);
+	if (buf == dummy && r > 0) r = 0;
+	return __syscall_ret(r);
 }
diff --git a/src/unistd/setxid.c b/src/unistd/setxid.c
index 0239f8af..a629ed4b 100644
--- a/src/unistd/setxid.c
+++ b/src/unistd/setxid.c
@@ -1,20 +1,19 @@
 #include <unistd.h>
-#include <errno.h>
+#include <signal.h>
 #include "syscall.h"
 #include "libc.h"
-#include "pthread_impl.h"
 
 struct ctx {
 	int id, eid, sid;
-	int nr, err;
+	int nr, ret;
 };
 
 static void do_setxid(void *p)
 {
 	struct ctx *c = p;
-	if (c->err>0) return;
-	int ret = -__syscall(c->nr, c->id, c->eid, c->sid);
-	if (ret && !c->err) {
+	if (c->ret<0) return;
+	int ret = __syscall(c->nr, c->id, c->eid, c->sid);
+	if (ret && !c->ret) {
 		/* If one thread fails to set ids after another has already
 		 * succeeded, forcibly killing the process is the only safe
 		 * thing to do. State is inconsistent and dangerous. Use
@@ -22,18 +21,14 @@ static void do_setxid(void *p)
 		__block_all_sigs(0);
 		__syscall(SYS_kill, __syscall(SYS_getpid), SIGKILL);
 	}
-	c->err = ret;
+	c->ret = ret;
 }
 
 int __setxid(int nr, int id, int eid, int sid)
 {
-	/* err is initially nonzero so that failure of the first thread does not
+	/* ret is initially nonzero so that failure of the first thread does not
 	 * trigger the safety kill above. */
-	struct ctx c = { .nr = nr, .id = id, .eid = eid, .sid = sid, .err = -1 };
+	struct ctx c = { .nr = nr, .id = id, .eid = eid, .sid = sid, .ret = 1 };
 	__synccall(do_setxid, &c);
-	if (c.err) {
-		if (c.err>0) errno = c.err;
-		return -1;
-	}
-	return 0;
+	return __syscall_ret(c.ret > 0 ? -EAGAIN : c.ret);
 }
diff --git a/src/unistd/truncate.c b/src/unistd/truncate.c
index 97296800..077351e1 100644
--- a/src/unistd/truncate.c
+++ b/src/unistd/truncate.c
@@ -5,5 +5,3 @@ int truncate(const char *path, off_t length)
 {
 	return syscall(SYS_truncate, path, __SYSCALL_LL_O(length));
 }
-
-weak_alias(truncate, truncate64);
diff --git a/src/unistd/x32/lseek.c b/src/unistd/x32/lseek.c
index 32636429..5f93292f 100644
--- a/src/unistd/x32/lseek.c
+++ b/src/unistd/x32/lseek.c
@@ -12,4 +12,3 @@ off_t __lseek(int fd, off_t offset, int whence)
 }
 
 weak_alias(__lseek, lseek);
-weak_alias(__lseek, lseek64);
diff --git a/tools/install.sh b/tools/install.sh
index d913b60b..855a8ca2 100755
--- a/tools/install.sh
+++ b/tools/install.sh
@@ -48,7 +48,9 @@ trap 'rm -f "$tmp"' EXIT INT QUIT TERM HUP
 umask 077
 
 if test "$symlink" ; then
+umask 000
 ln -s "$1" "$tmp"
+umask 077
 else
 cat < "$1" > "$tmp"
 chmod "$mode" "$tmp"